#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"
+static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t pipeline);
+
static void
emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_device *device = cmd_buffer->device;
+ UNUSED const struct gen_device_info *devinfo = &device->info;
uint32_t mocs = device->isl_dev.mocs.internal;
/* If we are emitting a new state base address we probably need to re-emit
pc.CommandStreamerStallEnable = true;
#if GEN_GEN >= 12
pc.TileCacheFlushEnable = true;
+#endif
+#if GEN_GEN == 12
+ /* GEN:BUG:1606662791:
+ *
+ * Software must program PIPE_CONTROL command with "HDC Pipeline
+ * Flush" prior to programming of the below two non-pipeline state :
+ * * STATE_BASE_ADDRESS
+ * * 3DSTATE_BINDING_TABLE_POOL_ALLOC
+ */
+ if (devinfo->revision == 0 /* A0 */)
+ pc.HDCPipelineFlushEnable = true;
#endif
}
+#if GEN_GEN == 12
+ /* GEN:BUG:1607854226:
+ *
+ * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
+ * mode by putting the pipeline temporarily in 3D mode.
+ */
+ uint32_t gen12_wa_pipeline = cmd_buffer->state.current_pipeline;
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+#endif
+
anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
sba.GeneralStateMOCS = mocs;
* these fields. However, since we will be growing the BO's live, we
* just set them all to the maximum.
*/
- sba.GeneralStateBufferSize = 0xfffff;
+ sba.GeneralStateBufferSize = 0xfffff;
+ sba.IndirectObjectBufferSize = 0xfffff;
+ if (device->physical->use_softpin) {
+ /* With softpin, we use fixed addresses so we actually know how big
+ * our base addresses are.
+ */
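+ /* The size fields here are expressed in 4 KiB pages, hence the / 4096. */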
+ sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
+ sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
+ } else {
+ sba.DynamicStateBufferSize = 0xfffff;
+ sba.InstructionBufferSize = 0xfffff;
+ }
sba.GeneralStateBufferSizeModifyEnable = true;
- sba.DynamicStateBufferSize = 0xfffff;
- sba.DynamicStateBufferSizeModifyEnable = true;
- sba.IndirectObjectBufferSize = 0xfffff;
sba.IndirectObjectBufferSizeModifyEnable = true;
- sba.InstructionBufferSize = 0xfffff;
+ sba.DynamicStateBufferSizeModifyEnable = true;
sba.InstructionBuffersizeModifyEnable = true;
# else
/* On gen7, we have upper bounds instead. According to the docs,
sba.InstructionAccessUpperBoundModifyEnable = true;
# endif
# if (GEN_GEN >= 9)
- if (cmd_buffer->device->instance->physicalDevice.use_softpin) {
+ if (cmd_buffer->device->physical->use_softpin) {
sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
.bo = device->surface_state_pool.block_pool.bo,
.offset = 0,
# endif
}
+#if GEN_GEN == 12
+ /* GEN:BUG:1607854226:
+ *
+ * Put the pipeline back into its current mode.
+ */
+ if (gen12_wa_pipeline != UINT32_MAX)
+ genX(flush_pipeline_select)(cmd_buffer, gen12_wa_pipeline);
+#endif
+
/* After re-setting the surface state base address, we have to do some
* cache flushing so that the sampler engine will pick up the new
* SURFACE_STATE objects and binding tables. From the Broadwell PRM,
att_state->aux_usage =
anv_layout_to_aux_usage(&device->info, iview->image,
VK_IMAGE_ASPECT_COLOR_BIT,
+ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);
/* If we don't have aux, then we should have returned early in the layer
*/
if (cmd_state->pass->attachments[att].first_subpass_layout ==
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
- anv_perf_warn(device->instance, iview->image,
+ anv_perf_warn(device, iview->image,
"Not temporarily enabling CCS_E.");
}
} else {
if (att_state->fast_clear &&
(iview->planes[0].isl.base_level > 0 ||
iview->planes[0].isl.base_array_layer > 0)) {
- anv_perf_warn(device->instance, iview->image,
+ anv_perf_warn(device, iview->image,
"Rendering with multi-lod or multi-layer framebuffer "
"with LOAD_OP_LOAD and baseMipLevel > 0 or "
"baseArrayLayer > 0. Not fast clearing.");
att_state->fast_clear = false;
} else if (att_state->fast_clear && cmd_state->framebuffer->layers > 1) {
- anv_perf_warn(device->instance, iview->image,
+ anv_perf_warn(device, iview->image,
"Rendering to a multi-layer framebuffer with "
"LOAD_OP_CLEAR. Only fast-clearing the first slice");
}
att_state->aux_usage = ISL_AUX_USAGE_NONE;
att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
+ /* This is unused for depth/stencil but valgrind complains if it
+ * isn't initialized
+ */
+ att_state->clear_color_is_zero_one = false;
+
if (GEN_GEN == 7) {
/* We don't do any HiZ or depth fast-clears on gen7 yet */
att_state->fast_clear = false;
const enum isl_aux_usage first_subpass_aux_usage =
anv_layout_to_aux_usage(&device->info, iview->image,
VK_IMAGE_ASPECT_DEPTH_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
pass_att->first_subpass_layout);
if (!blorp_can_hiz_clear_depth(&device->info,
&iview->image->planes[0].surface.isl,
VkImageLayout initial_layout,
VkImageLayout final_layout)
{
- const bool hiz_enabled = ISL_AUX_USAGE_HIZ ==
- anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
- VK_IMAGE_ASPECT_DEPTH_BIT, initial_layout);
- const bool enable_hiz = ISL_AUX_USAGE_HIZ ==
- anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
- VK_IMAGE_ASPECT_DEPTH_BIT, final_layout);
-
- enum isl_aux_op hiz_op;
- if (hiz_enabled && !enable_hiz) {
- hiz_op = ISL_AUX_OP_FULL_RESOLVE;
- } else if (!hiz_enabled && enable_hiz) {
- hiz_op = ISL_AUX_OP_AMBIGUATE;
- } else {
- assert(hiz_enabled == enable_hiz);
- /* If the same buffer will be used, no resolves are necessary. */
- hiz_op = ISL_AUX_OP_NONE;
- }
+ uint32_t depth_plane =
+ anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT);
+ if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
+ return;
- if (hiz_op != ISL_AUX_OP_NONE)
+ const enum isl_aux_state initial_state =
+ anv_layout_to_aux_state(&cmd_buffer->device->info, image,
+ VK_IMAGE_ASPECT_DEPTH_BIT,
+ initial_layout);
+ const enum isl_aux_state final_state =
+ anv_layout_to_aux_state(&cmd_buffer->device->info, image,
+ VK_IMAGE_ASPECT_DEPTH_BIT,
+ final_layout);
+
+ const bool initial_depth_valid =
+ isl_aux_state_has_valid_primary(initial_state);
+ const bool initial_hiz_valid =
+ isl_aux_state_has_valid_aux(initial_state);
+ const bool final_needs_depth =
+ isl_aux_state_has_valid_primary(final_state);
+ const bool final_needs_hiz =
+ isl_aux_state_has_valid_aux(final_state);
+
+ /* Getting into the pass-through state for Depth is tricky and involves
+ * both a resolve and an ambiguate. We don't handle that state right now
+ * as anv_layout_to_aux_state never returns it.
+ */
+ assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
+
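+ /* A resolve is required when the new layout reads the depth surface but
+ * only the HiZ data is currently valid; an ambiguate is required when HiZ
+ * must be valid but only the depth data is.
+ */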
+ if (final_needs_depth && !initial_depth_valid) {
+ assert(initial_hiz_valid);
anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, 0, 1, hiz_op);
+ 0, 0, 1, ISL_AUX_OP_FULL_RESOLVE);
+ } else if (final_needs_hiz && !initial_hiz_valid) {
+ assert(initial_depth_valid);
+ anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
+ 0, 0, 1, ISL_AUX_OP_AMBIGUATE);
+ }
}
static inline bool
* to do a partial resolve on a CCS_D surface.
*/
if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
- image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
+ image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
resolve_op = ISL_AUX_OP_FULL_RESOLVE;
anv_image_ccs_op(cmd_buffer, image, format, aspect, level,
}
}
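+/* Force a single volatile read so the compiler cannot merge or elide the
+ * load of an aux-map entry we are about to rewrite in place.
+ */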
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
+#if GEN_GEN == 12
+static void
+anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ VkImageAspectFlagBits aspect,
+ uint32_t base_level, uint32_t level_count,
+ uint32_t base_layer, uint32_t layer_count)
+{
+ uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);
+
+ uint64_t base_address =
+ anv_address_physical(image->planes[plane].address);
+
+ const struct isl_surf *isl_surf = &image->planes[plane].surface.isl;
+ uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf);
+
+ /* We're about to live-update the AUX-TT. We really don't want anyone else
+ * trying to read it while we're doing this. We could probably get away
+ * with not having this stall in some cases if we were really careful but
+ * it's better to play it safe. Full stall the GPU.
+ */
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ struct gen_mi_builder b;
+ gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+ for (uint32_t a = 0; a < layer_count; a++) {
+ const uint32_t layer = base_layer + a;
+
+ uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
+ for (uint32_t l = 0; l < level_count; l++) {
+ const uint32_t level = base_level + l;
+
+ uint32_t logical_array_layer, logical_z_offset_px;
+ if (image->type == VK_IMAGE_TYPE_3D) {
+ logical_array_layer = 0;
+
+ /* If the given miplevel does not have this layer, then any higher
+ * miplevels won't either because miplevels only get smaller the
+ * higher the LOD.
+ */
+ assert(layer < image->extent.depth);
+ if (layer >= anv_minify(image->extent.depth, level))
+ break;
+ logical_z_offset_px = layer;
+ } else {
+ assert(layer < image->array_size);
+ logical_array_layer = layer;
+ logical_z_offset_px = 0;
+ }
+
+ uint32_t slice_start_offset_B, slice_end_offset_B;
+ isl_surf_get_image_range_B_tile(isl_surf, level,
+ logical_array_layer,
+ logical_z_offset_px,
+ &slice_start_offset_B,
+ &slice_end_offset_B);
+
+ start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
+ end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
+ }
+
+ /* Aux operates 64K at a time */
+ start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
+ end_offset_B = align_u64(end_offset_B, 64 * 1024);
+
+ for (uint64_t offset = start_offset_B;
+ offset < end_offset_B; offset += 64 * 1024) {
+ uint64_t address = base_address + offset;
+
+ uint64_t aux_entry_addr64, *aux_entry_map;
+ aux_entry_map = gen_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
+ address, &aux_entry_addr64);
+
+ assert(cmd_buffer->device->physical->use_softpin);
+ struct anv_address aux_entry_address = {
+ .bo = NULL,
+ .offset = aux_entry_addr64,
+ };
+
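+ /* Keep whatever aux-data address is already in the entry; only the format
+ * bits (and the valid bit below) are rewritten here.
+ */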
+ const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
+ uint64_t new_aux_entry =
+ (old_aux_entry & GEN_AUX_MAP_ADDRESS_MASK) | format_bits;
+
+ if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
+ new_aux_entry |= GEN_AUX_MAP_ENTRY_VALID_BIT;
+
+ gen_mi_store(&b, gen_mi_mem64(aux_entry_address),
+ gen_mi_imm(new_aux_entry));
+ }
+ }
+
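+ /* The entries above were written through the command streamer, so the
+ * aux-table cache must be invalidated before they are used.
+ */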
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
+}
+#endif /* GEN_GEN == 12 */
+
/**
* @brief Transitions a color buffer from one layout to another.
*
VkImageLayout initial_layout,
VkImageLayout final_layout)
{
- const struct gen_device_info *devinfo = &cmd_buffer->device->info;
+ struct anv_device *device = cmd_buffer->device;
+ const struct gen_device_info *devinfo = &device->info;
/* Validate the inputs. */
assert(cmd_buffer);
assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
return;
- assert(image->tiling == VK_IMAGE_TILING_OPTIMAL);
+ assert(image->planes[plane].surface.isl.tiling != ISL_TILING_LINEAR);
if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
+#if GEN_GEN == 12
+ if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
+ anv_image_init_aux_tt(cmd_buffer, image, aspect,
+ base_level, level_count,
+ base_layer, layer_count);
+ }
+#else
+ assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
+#endif
+
/* A subresource in the undefined layout may have been aliased and
* populated with any arrangement of bits. Therefore, we must initialize
* the related aux buffer and clear buffer entry with desirable values.
}
} else {
if (image->samples == 4 || image->samples == 16) {
- anv_perf_warn(cmd_buffer->device->instance, image,
+ anv_perf_warn(cmd_buffer->device, image,
"Doing a potentially unnecessary fast-clear to "
"define an MCS buffer.");
}
}
const enum isl_aux_usage initial_aux_usage =
- anv_layout_to_aux_usage(devinfo, image, aspect, initial_layout);
+ anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
const enum isl_aux_usage final_aux_usage =
- anv_layout_to_aux_usage(devinfo, image, aspect, final_layout);
+ anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
/* The current code assumes that there is no mixing of CCS_E and CCS_D.
* We can handle transitions between CCS_D/E to and from NONE. What we
* executing anything. The chances are fairly high that they will use
* blorp at least once per primary command buffer so it shouldn't be
* wasted.
+ *
+ * There is also a workaround on gen8 which requires us to invalidate the
+ * VF cache occasionally. It's easier if we can assume we start with a
+ * fresh cache (See also genX(cmd_buffer_set_binding_for_gen8_vb_flush).)
+ */
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+
+ /* Re-emit the aux table register in every command buffer. This way we're
+ * ensured that we have the table even if this command buffer doesn't
+ * initialize any images.
*/
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
/* We send an "Indirect State Pointers Disable" packet at
* EndCommandBuffer, so all push constant packets are ignored during a
enum isl_aux_usage aux_usage =
anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT, layout);
+ VK_IMAGE_ASPECT_DEPTH_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ layout);
cmd_buffer->state.hiz_enabled = aux_usage == ISL_AUX_USAGE_HIZ;
}
anv_cmd_buffer_add_secondary(primary, secondary);
}
+ /* The secondary isn't counted in our VF cache tracking so we need to
+ * invalidate the whole thing.
+ */
+ if (GEN_GEN >= 8 && GEN_GEN <= 9) {
+ primary->state.pending_pipe_bits |=
+ ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ }
+
/* The secondary may have selected a different pipeline (3D or compute) and
* may have changed the current L3$ configuration. Reset our tracking
* variables to invalid values to ensure that we re-emit these in the case
uint32_t l3cr;
anv_pack_struct(&l3cr, L3_ALLOCATION_REG,
-#if GEN_GEN < 12
+#if GEN_GEN < 11
.SLMEnable = has_slm,
#endif
#if GEN_GEN == 11
emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3);
#if GEN_IS_HASWELL
- if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) {
+ if (cmd_buffer->device->physical->cmd_parser_version >= 4) {
/* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
* them disabled to avoid crashing the system hard.
*/
void
genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
{
+ UNUSED const struct gen_device_info *devinfo = &cmd_buffer->device->info;
enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
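+ /* When always_flush_cache is set, every flush point becomes a full flush
+ * plus an invalidate.
+ */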
+ if (cmd_buffer->device->physical->always_flush_cache)
+ bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
+
/* Flushes are pipelined while invalidations are handled immediately.
* Therefore, if we're flushing anything then we need to schedule a stall
* before any invalidations can happen.
bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
}
+ /* GEN:BUG:1409226450, Wait for EU to be idle before pipe control which
+ * invalidates the instruction cache
+ */
+ if (GEN_GEN == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))
+ bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+
+ if ((GEN_GEN >= 8 && GEN_GEN <= 9) &&
+ (bits & ANV_PIPE_CS_STALL_BIT) &&
+ (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
+ /* If we are doing a VF cache invalidate AND a CS stall (it must be
+ * both) then we can reset our vertex cache tracking.
+ */
+ memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
+ sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
+ memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
+ sizeof(cmd_buffer->state.gfx.ib_dirty_range));
+ }
+
+ /* Project: SKL / Argument: LRI Post Sync Operation [23]
+ *
+ * "PIPECONTROL command with “Command Streamer Stall Enable” must be
+ * programmed prior to programming a PIPECONTROL command with "LRI
+ * Post Sync Operation" in GPGPU mode of operation (i.e when
+ * PIPELINE_SELECT command is set to GPGPU mode of operation)."
+ *
+ * The same text exists a few rows below for Post Sync Op.
+ *
+ * On Gen12 this is GEN:BUG:1607156449.
+ */
+ if (bits & ANV_PIPE_POST_SYNC_BIT) {
+ if ((GEN_GEN == 9 || (GEN_GEN == 12 && devinfo->revision == 0 /* A0 */)) &&
+ cmd_buffer->state.current_pipeline == GPGPU)
+ bits |= ANV_PIPE_CS_STALL_BIT;
+ bits &= ~ANV_PIPE_POST_SYNC_BIT;
+ }
+
if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
#if GEN_GEN >= 12
pipe.RenderTargetCacheFlushEnable =
bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
+ * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
+ */
+#if GEN_GEN >= 12
+ pipe.DepthStallEnable =
+ pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);
+#else
pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
+#endif
+
pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
* I chose "Stall at Pixel Scoreboard" since that's what we use in
* mesa and it seems to work fine. The choice is fairly arbitrary.
*/
- if ((bits & ANV_PIPE_CS_STALL_BIT) &&
- !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
- ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
+ if (pipe.CommandStreamerStallEnable &&
+ !pipe.RenderTargetCacheFlushEnable &&
+ !pipe.DepthCacheFlushEnable &&
+ !pipe.StallAtPixelScoreboard &&
+ !pipe.PostSyncOperation &&
+ !pipe.DepthStallEnable &&
+ !pipe.DCFlushEnable)
pipe.StallAtPixelScoreboard = true;
}
}
}
+#if GEN_GEN == 12
+ if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) &&
+ cmd_buffer->device->info.has_aux_map) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
+ lri.DataDWord = 1;
+ }
+ }
+#endif
+
bits &= ~ANV_PIPE_INVALIDATE_BITS;
}
}
}
+static struct anv_cmd_pipeline_state *
+pipe_state_for_stage(struct anv_cmd_buffer *cmd_buffer,
+ gl_shader_stage stage)
+{
+ switch (stage) {
+ case MESA_SHADER_COMPUTE:
+ return &cmd_buffer->state.compute.base;
+
+ case MESA_SHADER_VERTEX:
+ case MESA_SHADER_TESS_CTRL:
+ case MESA_SHADER_TESS_EVAL:
+ case MESA_SHADER_GEOMETRY:
+ case MESA_SHADER_FRAGMENT:
+ return &cmd_buffer->state.gfx.base;
+
+ default:
+ unreachable("invalid stage");
+ }
+}
+
static VkResult
emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
gl_shader_stage stage,
struct anv_state *bt_state)
{
struct anv_subpass *subpass = cmd_buffer->state.subpass;
- struct anv_cmd_pipeline_state *pipe_state;
- struct anv_pipeline *pipeline;
uint32_t state_offset;
- switch (stage) {
- case MESA_SHADER_COMPUTE:
- pipe_state = &cmd_buffer->state.compute.base;
- break;
- default:
- pipe_state = &cmd_buffer->state.gfx.base;
- break;
- }
- pipeline = pipe_state->pipeline;
+ struct anv_cmd_pipeline_state *pipe_state =
+ pipe_state_for_stage(cmd_buffer, stage);
+ struct anv_pipeline *pipeline = pipe_state->pipeline;
if (!anv_pipeline_has_stage(pipeline, stage)) {
*bt_state = (struct anv_state) { 0, };
* softpin then we always keep all user-allocated memory objects resident.
*/
const bool need_client_mem_relocs =
- !cmd_buffer->device->instance->physicalDevice.use_softpin;
+ !cmd_buffer->device->physical->use_softpin;
for (uint32_t s = 0; s < map->surface_count; s++) {
struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
struct anv_state *state)
{
struct anv_cmd_pipeline_state *pipe_state =
- stage == MESA_SHADER_COMPUTE ? &cmd_buffer->state.compute.base :
- &cmd_buffer->state.gfx.base;
+ pipe_state_for_stage(cmd_buffer, stage);
struct anv_pipeline *pipeline = pipe_state->pipeline;
if (!anv_pipeline_has_stage(pipeline, stage)) {
}
static uint32_t
-flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer)
+flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_pipeline *pipeline)
{
- struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline;
-
VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty &
pipeline->active_stages;
}
}
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
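+/* Resolve a push range to the address its constants should be read from:
+ * a descriptor-set buffer, the driver-internal push constant state, or a
+ * (possibly dynamic) uniform buffer descriptor.
+ */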
+static struct anv_address
+get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
+ gl_shader_stage stage,
+ const struct anv_push_range *range)
+{
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ switch (range->set) {
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
+ /* This is a descriptor set buffer so the set index is
+ * actually given by binding->binding. (Yes, that's
+ * confusing.)
+ */
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->index];
+ return anv_descriptor_set_address(cmd_buffer, set);
+ break;
+ }
+
+ case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
+ struct anv_state state =
+ anv_cmd_buffer_push_constants(cmd_buffer, stage);
+ return (struct anv_address) {
+ .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+ .offset = state.offset,
+ };
+ break;
+ }
+
+ default: {
+ assert(range->set < MAX_SETS);
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->set];
+ const struct anv_descriptor *desc =
+ &set->descriptors[range->index];
+
+ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
+ return desc->buffer_view->address;
+ } else {
+ assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
+ struct anv_push_constants *push =
+ &cmd_buffer->state.push_constants[stage];
+ uint32_t dynamic_offset =
+ push->dynamic_offsets[range->dynamic_offset_index];
+ return anv_address_add(desc->buffer->address,
+ desc->offset + dynamic_offset);
+ }
+ }
+ }
+}
+#endif
+
static void
-cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
- VkShaderStageFlags dirty_stages)
+cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
+ gl_shader_stage stage, unsigned buffer_count)
{
const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
const struct anv_pipeline *pipeline = gfx_state->base.pipeline;
[MESA_SHADER_COMPUTE] = 0,
};
- VkShaderStageFlags flushed = 0;
+ assert(stage < ARRAY_SIZE(push_constant_opcodes));
+ assert(push_constant_opcodes[stage] > 0);
- anv_foreach_stage(stage, dirty_stages) {
- assert(stage < ARRAY_SIZE(push_constant_opcodes));
- assert(push_constant_opcodes[stage] > 0);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
- c._3DCommandSubOpcode = push_constant_opcodes[stage];
-
- if (anv_pipeline_has_stage(pipeline, stage)) {
- const struct anv_pipeline_bind_map *bind_map =
- &pipeline->shaders[stage]->bind_map;
-
- for (unsigned i = 0; i < 4; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
- if (range->length == 0)
- continue;
-
- struct anv_address addr;
- switch (range->set) {
- case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
- /* This is a descriptor set buffer so the set index is
- * actually given by binding->binding. (Yes, that's
- * confusing.)
- */
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->index];
- addr = anv_descriptor_set_address(cmd_buffer, set);
- break;
- }
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
+ c._3DCommandSubOpcode = push_constant_opcodes[stage];
- case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
- struct anv_state state =
- anv_cmd_buffer_push_constants(cmd_buffer, stage);
- addr = (struct anv_address) {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = state.offset,
- };
- break;
- }
+ if (anv_pipeline_has_stage(pipeline, stage)) {
+ const struct anv_pipeline_bind_map *bind_map =
+ &pipeline->shaders[stage]->bind_map;
- default: {
- assert(range->set < MAX_SETS);
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->set];
- const struct anv_descriptor *desc =
- &set->descriptors[range->index];
-
- if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
- addr = desc->buffer_view->address;
- } else {
- assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
- struct anv_push_constants *push =
- &cmd_buffer->state.push_constants[stage];
- uint32_t dynamic_offset =
- push->dynamic_offsets[range->dynamic_offset_index];
- addr = anv_address_add(desc->buffer->address,
- desc->offset + dynamic_offset);
- }
- }
- }
+#if GEN_GEN >= 12
+ c.MOCS = cmd_buffer->device->isl_dev.mocs.internal;
+#endif
- c.ConstantBody.ReadLength[i] = range->length;
- c.ConstantBody.Buffer[i] =
- anv_address_add(addr, range->start * 32);
- }
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+ /* The Skylake PRM contains the following restriction:
+ *
+ * "The driver must ensure The following case does not occur
+ * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+ * buffer 3 read length equal to zero committed followed by a
+ * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+ * zero committed."
+ *
+ * To avoid this, we program the buffers in the highest slots.
+ * This way, slot 0 is only used if slot 3 is also used.
+ */
+ assert(buffer_count <= 4);
+ const unsigned shift = 4 - buffer_count;
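+ /* With buffer_count == 2, for example, the ranges land in slots 2 and 3. */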
+ for (unsigned i = 0; i < buffer_count; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+
+ /* At this point we only have non-empty ranges */
+ assert(range->length > 0);
+
+ /* For Ivy Bridge, make sure we only set the first range (actual
+ * push constants)
+ */
+ assert((GEN_GEN >= 8 || GEN_IS_HASWELL) || i == 0);
+
+ const struct anv_address addr =
+ get_push_range_address(cmd_buffer, stage, range);
+ c.ConstantBody.ReadLength[i + shift] = range->length;
+ c.ConstantBody.Buffer[i + shift] =
+ anv_address_add(addr, range->start * 32);
+ }
+#else
+ /* For Ivy Bridge, push constants are relative to dynamic state
+ * base address and we only ever push actual push constants.
+ */
+ if (bind_map->push_ranges[0].length > 0) {
+ assert(bind_map->push_ranges[0].set ==
+ ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
+ struct anv_state state =
+ anv_cmd_buffer_push_constants(cmd_buffer, stage);
+ c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
+ c.ConstantBody.Buffer[0].bo = NULL;
+ c.ConstantBody.Buffer[0].offset = state.offset;
}
+ assert(bind_map->push_ranges[1].length == 0);
+ assert(bind_map->push_ranges[2].length == 0);
+ assert(bind_map->push_ranges[3].length == 0);
+#endif
}
-
- flushed |= mesa_to_vk_shader_stage(stage);
}
-
- cmd_buffer->state.push_constants_dirty &= ~flushed;
}
#if GEN_GEN >= 12
-void
-genX(cmd_buffer_aux_map_state)(struct anv_cmd_buffer *cmd_buffer)
+static void
+cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t shader_mask, uint32_t count)
{
- void *aux_map_ctx = cmd_buffer->device->aux_map_ctx;
- if (!aux_map_ctx)
+ if (count == 0) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
+ c.ShaderUpdateEnable = shader_mask;
+ c.MOCS = cmd_buffer->device->isl_dev.mocs.internal;
+ }
return;
- uint32_t aux_map_state_num = gen_aux_map_get_state_num(aux_map_ctx);
- if (cmd_buffer->state.last_aux_map_state != aux_map_state_num) {
- /* If the aux-map state number increased, then we need to rewrite the
- * register. Rewriting the register is used to both set the aux-map
- * translation table address, and also to invalidate any previously
- * cached translations.
+ }
+
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_pipeline *pipeline = gfx_state->base.pipeline;
+
+ static const uint32_t push_constant_opcodes[] = {
+ [MESA_SHADER_VERTEX] = 21,
+ [MESA_SHADER_TESS_CTRL] = 25, /* HS */
+ [MESA_SHADER_TESS_EVAL] = 26, /* DS */
+ [MESA_SHADER_GEOMETRY] = 22,
+ [MESA_SHADER_FRAGMENT] = 23,
+ [MESA_SHADER_COMPUTE] = 0,
+ };
+
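+ /* With a non-zero count, shader_mask names exactly one stage (see the
+ * caller), so decoding it to a single gl_shader_stage is safe.
+ */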
+ gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
+ assert(stage < ARRAY_SIZE(push_constant_opcodes));
+ assert(push_constant_opcodes[stage] > 0);
+
+ const struct anv_pipeline_bind_map *bind_map =
+ &pipeline->shaders[stage]->bind_map;
+
+ uint32_t *dw;
+ const uint32_t buffers = (1 << count) - 1;
+ const uint32_t num_dwords = 2 + 2 * count;
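+ /* Two DWords of header plus one 2-DWord CONSTANT_ALL_DATA entry per buffer. */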
+
+ dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+ GENX(3DSTATE_CONSTANT_ALL),
+ .ShaderUpdateEnable = shader_mask,
+ .PointerBufferMask = buffers,
+ .MOCS = cmd_buffer->device->isl_dev.mocs.internal);
+
+ for (int i = 0; i < count; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+ const struct anv_address addr =
+ get_push_range_address(cmd_buffer, stage, range);
+
+ GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
+ &cmd_buffer->batch, dw + 2 + i * 2,
+ &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
+ .PointerToConstantBuffer = anv_address_add(addr, range->start * 32),
+ .ConstantBufferReadLength = range->length,
+ });
+ }
+}
+#endif
+
+static void
+cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
+ VkShaderStageFlags dirty_stages)
+{
+ VkShaderStageFlags flushed = 0;
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_pipeline *pipeline = gfx_state->base.pipeline;
+
+#if GEN_GEN >= 12
+ uint32_t nobuffer_stages = 0;
+#endif
+
+ anv_foreach_stage(stage, dirty_stages) {
+ unsigned buffer_count = 0;
+ flushed |= mesa_to_vk_shader_stage(stage);
+ uint32_t max_push_range = 0;
+
+ if (anv_pipeline_has_stage(pipeline, stage)) {
+ const struct anv_pipeline_bind_map *bind_map =
+ &pipeline->shaders[stage]->bind_map;
+
+ for (unsigned i = 0; i < 4; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+ if (range->length > 0) {
+ buffer_count++;
+ if (GEN_GEN >= 12 && range->length > max_push_range)
+ max_push_range = range->length;
+ }
+ }
+ }
+
+#if GEN_GEN >= 12
+ /* If this stage doesn't have any push constants, emit it later in a
+ * single CONSTANT_ALL packet.
*/
- uint64_t base_addr = gen_aux_map_get_base(aux_map_ctx);
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
- lri.DataDWord = base_addr & 0xffffffff;
+ if (buffer_count == 0) {
+ nobuffer_stages |= 1 << stage;
+ continue;
}
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
- lri.DataDWord = base_addr >> 32;
+
+ /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
+ * contains only 5 bits, so we can only use it for buffers smaller than
+ * 32.
+ */
+ if (max_push_range < 32) {
+ cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
+ buffer_count);
+ continue;
}
- cmd_buffer->state.last_aux_map_state = aux_map_state_num;
+#endif
+
+ cmd_buffer_emit_push_constant(cmd_buffer, stage, buffer_count);
}
-}
+
+#if GEN_GEN >= 12
+ if (nobuffer_stages)
+ cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, 0);
#endif
+ cmd_buffer->state.push_constants_dirty &= ~flushed;
+}
+
void
genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
{
assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
- genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
+ genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->l3_config);
genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
genX(flush_pipeline_select_3d)(cmd_buffer);
-#if GEN_GEN >= 12
- genX(cmd_buffer_aux_map_state)(cmd_buffer);
-#endif
-
if (vb_emit) {
const uint32_t num_buffers = __builtin_popcount(vb_emit);
const uint32_t num_dwords = 1 + num_buffers * 4;
#endif
};
+#if GEN_GEN >= 8 && GEN_GEN <= 9
+ genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb,
+ state.BufferStartingAddress,
+ state.BufferSize);
+#endif
+
GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
i++;
}
*/
uint32_t dirty = 0;
if (cmd_buffer->state.descriptors_dirty)
- dirty = flush_descriptor_sets(cmd_buffer);
+ dirty = flush_descriptor_sets(cmd_buffer, pipeline);
if (dirty || cmd_buffer->state.push_constants_dirty) {
/* Because we're pushing UBOs, we have to push whenever either
gen7_cmd_buffer_emit_scissor(cmd_buffer);
genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
-
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
static void
.EndAddress = anv_address_add(addr, size),
#endif
});
+
+ genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer,
+ index, addr, size);
}
static void
emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
}
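+/* After a draw, fold the ranges of the vertex (and index) buffers it used
+ * into the gen8/9 VF cache dirty tracking; see the comment block above
+ * genX(cmd_buffer_set_binding_for_gen8_vb_flush) further down.
+ */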
+static void
+update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t access_type)
+{
+ struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline;
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ uint64_t vb_used = pipeline->vb_used;
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance)
+ vb_used |= 1ull << ANV_SVGS_VB_INDEX;
+ if (vs_prog_data->uses_drawid)
+ vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
+
+ genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer,
+ access_type == RANDOM,
+ vb_used);
+}
+
void genX(CmdDraw)(
VkCommandBuffer commandBuffer,
uint32_t vertexCount,
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, 0);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
/* Our implementation of VK_KHR_multiview uses instancing to draw the
* different views. We need to multiply instanceCount by the view count.
*/
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = 0;
}
+
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
}
void genX(CmdDrawIndexed)(
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, 0);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
/* Our implementation of VK_KHR_multiview uses instancing to draw the
* different views. We need to multiply instanceCount by the view count.
*/
prim.StartInstanceLocation = firstInstance;
prim.BaseVertexLocation = vertexOffset;
}
+
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
}
/* Auto-Draw / Indirect Registers */
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, 0);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
/* Our implementation of VK_KHR_multiview uses instancing to draw the
* different views. We need to multiply instanceCount by the view count.
*/
prim.VertexAccessType = SEQUENTIAL;
prim.PrimitiveTopologyType = pipeline->topology;
}
+
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
}
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, i);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
load_indirect_parameters(cmd_buffer, draw, false);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.PrimitiveTopologyType = pipeline->topology;
}
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
+
offset += stride;
}
}
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, i);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
load_indirect_parameters(cmd_buffer, draw, true);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.PrimitiveTopologyType = pipeline->topology;
}
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
+
offset += stride;
}
}
}
#endif
-void genX(CmdDrawIndirectCountKHR)(
+void genX(CmdDrawIndirectCount)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, i);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
load_indirect_parameters(cmd_buffer, draw, false);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.PrimitiveTopologyType = pipeline->topology;
}
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL);
+
offset += stride;
}
}
-void genX(CmdDrawIndexedIndirectCountKHR)(
+void genX(CmdDrawIndexedIndirectCount)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
if (vs_prog_data->uses_drawid)
emit_draw_index(cmd_buffer, i);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
load_indirect_parameters(cmd_buffer, draw, true);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.PrimitiveTopologyType = pipeline->topology;
}
+ update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM);
+
offset += stride;
}
}
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
}
-static VkResult
-flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline;
- struct anv_state surfaces = { 0, }, samplers = { 0, };
- VkResult result;
-
- result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
- if (result != VK_SUCCESS) {
- assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
-
- result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
- if (result != VK_SUCCESS)
- return result;
-
- /* Re-emit state base addresses so we get the new surface state base
- * address before we start emitting binding tables etc.
- */
- genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
-
- result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
- if (result != VK_SUCCESS) {
- anv_batch_set_error(&cmd_buffer->batch, result);
- return result;
- }
- }
-
- result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers);
- if (result != VK_SUCCESS) {
- anv_batch_set_error(&cmd_buffer->batch, result);
- return result;
- }
-
- uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
- struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
- .BindingTablePointer = surfaces.offset,
- .SamplerStatePointer = samplers.offset,
- };
- GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
-
- struct anv_state state =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
- pipeline->interface_descriptor_data,
- GENX(INTERFACE_DESCRIPTOR_DATA_length),
- 64);
-
- uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
- anv_batch_emit(&cmd_buffer->batch,
- GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
- mid.InterfaceDescriptorTotalLength = size;
- mid.InterfaceDescriptorDataStartAddress = state.offset;
- }
-
- return VK_SUCCESS;
-}
-
void
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline;
- VkResult result;
assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
- genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
+ genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->l3_config);
genX(flush_pipeline_select_gpgpu)(cmd_buffer);
-#if GEN_GEN >= 12
- genX(cmd_buffer_aux_map_state)(cmd_buffer);
-#endif
-
if (cmd_buffer->state.compute.pipeline_dirty) {
/* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
*
if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
cmd_buffer->state.compute.pipeline_dirty) {
- /* FIXME: figure out descriptors for gen7 */
- result = flush_compute_descriptor_set(cmd_buffer);
- if (result != VK_SUCCESS)
- return;
+ flush_descriptor_sets(cmd_buffer, pipeline);
+
+ uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
+ struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
+ .BindingTablePointer =
+ cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
+ .SamplerStatePointer =
+ cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
+ };
+ GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
+
+ struct anv_state state =
+ anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
+ pipeline->interface_descriptor_data,
+ GENX(INTERFACE_DESCRIPTOR_DATA_length),
+ 64);
- cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+ uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
+ mid.InterfaceDescriptorTotalLength = size;
+ mid.InterfaceDescriptorDataStartAddress = state.offset;
+ }
}
if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
int required_version,
const char *function)
{
- if (device->instance->physicalDevice.cmd_parser_version < required_version) {
- return vk_errorf(device->instance, device->instance,
+ if (device->physical->cmd_parser_version < required_version) {
+ return vk_errorf(device, device->physical,
VK_ERROR_FEATURE_NOT_PRESENT,
"cmd parser version %d is required for %s",
required_version, function);
.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
.offset = state.offset,
};
+
+ /* The num_workgroups buffer goes in the binding table */
+ cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
}
genX(cmd_buffer_flush_compute_state)(cmd_buffer);
return;
#endif
- if (prog_data->uses_num_work_groups)
+ if (prog_data->uses_num_work_groups) {
cmd_buffer->state.compute.num_workgroups = addr;
+ /* The num_workgroups buffer goes in the binding table */
+ cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
genX(cmd_buffer_flush_compute_state)(cmd_buffer);
struct gen_mi_builder b;
* really know why.
*/
const uint32_t subslices =
- MAX2(cmd_buffer->device->instance->physicalDevice.subslice_total, 1);
+ MAX2(cmd_buffer->device->physical->subslice_total, 1);
anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
vfe.MaximumNumberofThreads =
devinfo->max_cs_threads * subslices - 1;
vfe.NumberofURBEntries = 2;
vfe.URBEntryAllocationSize = 2;
}
+
+ /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
+ * invalid. Set the compute pipeline to dirty to force a re-emit of the
+ * pipeline in case we get back-to-back dispatch calls with the same
+ * pipeline and a PIPELINE_SELECT in between.
+ */
+ cmd_buffer->state.compute.pipeline_dirty = true;
}
#endif
pc.CommandStreamerStallEnable = true;
#if GEN_GEN >= 12
pc.TileCacheFlushEnable = true;
+
+ /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be
+ * set with any PIPE_CONTROL with Depth Flush Enable bit set.
+ */
+ pc.DepthStallEnable = true;
#endif
}
}
}
+/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
+ *
+ * "The VF cache needs to be invalidated before binding and then using
+ * Vertex Buffers that overlap with any previously bound Vertex Buffer
+ * (at a 64B granularity) since the last invalidation. A VF cache
+ * invalidate is performed by setting the "VF Cache Invalidation Enable"
+ * bit in PIPE_CONTROL."
+ *
+ * This is implemented by carefully tracking all vertex and index buffer
+ * bindings and flushing if the cache ever ends up with a range in the cache
+ * that would exceed 4 GiB. This is implemented in three parts:
+ *
+ * 1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called
+ * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
+ * tracking code of the new binding. If this new binding would cause
+ * the cache to have a too-large range on the next draw call, a pipeline
+ * stall and VF cache invalidate are added to pending_pipe_bits.
+ *
+ * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
+ * empty whenever we emit a VF invalidate.
+ *
+ * 3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called
+ * after every 3DPRIMITIVE and copies the bound range into the dirty
+ * range for each used buffer. This has to be a separate step because
+ * we don't always re-bind all buffers and so step 1 can't know which
+ * buffers are actually bound.
+ */
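+/* As an example, binding one vertex buffer at 0x1_0000_0000 and another at
+ * 0x2_4000_0000 leaves a dirty range wider than 4 GiB, so a CS stall and a
+ * VF cache invalidate are queued before the next draw.
+ */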
+void
+genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+ int vb_index,
+ struct anv_address vb_address,
+ uint32_t vb_size)
+{
+ if (GEN_GEN < 8 || GEN_GEN > 9 ||
+ !cmd_buffer->device->physical->use_softpin)
+ return;
+
+ struct anv_vb_cache_range *bound, *dirty;
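+ /* A vb_index of -1 denotes the index buffer. */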
+ if (vb_index == -1) {
+ bound = &cmd_buffer->state.gfx.ib_bound_range;
+ dirty = &cmd_buffer->state.gfx.ib_dirty_range;
+ } else {
+ assert(vb_index >= 0);
+ assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
+ assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
+ bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
+ dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
+ }
+
+ if (vb_size == 0) {
+ bound->start = 0;
+ bound->end = 0;
+ return;
+ }
+
+ assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
+ bound->start = gen_48b_address(anv_address_physical(vb_address));
+ bound->end = bound->start + vb_size;
+ assert(bound->end > bound->start); /* No overflow */
+
+ /* Align everything to a cache line */
+ bound->start &= ~(64ull - 1ull);
+ bound->end = align_u64(bound->end, 64);
+
+ /* Compute the dirty range */
+ dirty->start = MIN2(dirty->start, bound->start);
+ dirty->end = MAX2(dirty->end, bound->end);
+
+ /* If our range is larger than 32 bits, we have to flush */
+ assert(bound->end - bound->start <= (1ull << 32));
+ if (dirty->end - dirty->start > (1ull << 32)) {
+ cmd_buffer->state.pending_pipe_bits |=
+ ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ }
+}
+
+void
+genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t access_type,
+ uint64_t vb_used)
+{
+ if (GEN_GEN < 8 || GEN_GEN > 9 ||
+ !cmd_buffer->device->physical->use_softpin)
+ return;
+
+ if (access_type == RANDOM) {
+ /* We have an index buffer */
+ struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
+ struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
+
+ if (bound->end > bound->start) {
+ dirty->start = MIN2(dirty->start, bound->start);
+ dirty->end = MAX2(dirty->end, bound->end);
+ }
+ }
+
+ uint64_t mask = vb_used;
+ while (mask) {
+ int i = u_bit_scan64(&mask);
+ assert(i >= 0);
+ assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
+ assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
+
+ struct anv_vb_cache_range *bound, *dirty;
+ bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
+ dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
+
+ if (bound->end > bound->start) {
+ dirty->start = MIN2(dirty->start, bound->start);
+ dirty->end = MAX2(dirty->end, bound->end);
+ }
+ }
+}
+
/**
* Update the pixel hashing modes that determine the balancing of PS threads
* across subslices and slices.
isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
if (GEN_GEN >= 12) {
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
/* GEN:BUG:1408224581
*
* Workaround: Gen12LP Astep only An additional pipe control with
(att_state->fast_clear && !att_state->clear_color_is_zero_one) ||
att_state->input_aux_usage != att_state->aux_usage;
- VkImageLayout target_layout, target_stencil_layout;
+ VkImageLayout target_layout;
if (iview->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV &&
!input_needs_resolve) {
/* Layout transitions before the final only help to enable sampling
target_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
} else {
target_layout = subpass->attachments[i].layout;
- target_stencil_layout = subpass->attachments[i].stencil_layout;
}
+ VkImageLayout target_stencil_layout =
+ subpass->attachments[i].stencil_layout;
+
uint32_t base_layer, layer_count;
if (image->type == VK_IMAGE_TYPE_3D) {
base_layer = 0;
att_state->current_layout, target_layout);
att_state->aux_usage =
anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
- VK_IMAGE_ASPECT_DEPTH_BIT, target_layout);
+ VK_IMAGE_ASPECT_DEPTH_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ target_layout);
}
if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
if (GEN_GEN < 10 &&
(att_state->pending_load_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) &&
- image->planes[0].aux_surface.isl.size_B > 0 &&
+ image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
iview->planes[0].isl.base_level == 0 &&
iview->planes[0].isl.base_array_layer == 0) {
if (att_state->aux_usage != ISL_AUX_USAGE_NONE) {
* is set due to new association of BTI, PS Scoreboard Stall bit must
* be set in this packet."
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.RenderTargetCacheFlushEnable = true;
- pc.StallAtPixelScoreboard = true;
-#if GEN_GEN >= 12
- pc.TileCacheFlushEnable = true;
-#endif
- }
+ cmd_buffer->state.pending_pipe_bits |=
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
#endif
}
*/
transition_depth_buffer(cmd_buffer, src_iview->image,
src_state->current_layout,
- VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
src_state->aux_usage =
anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image,
VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
- src_state->current_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+ VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+ src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
/* MSAA resolves write to the resolve attachment as if it were any
* other transfer op. Transition the resolve attachment accordingly.
dst_state->aux_usage =
anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image,
VK_IMAGE_ASPECT_DEPTH_BIT,
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
if ((src_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
- src_state->current_stencil_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+ src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE;
* SRGB view & a UNORM image).
*/
if (fast_clear_type != ANV_FAST_CLEAR_NONE) {
- anv_perf_warn(cmd_buffer->device->instance, iview,
+ anv_perf_warn(cmd_buffer->device, iview,
"Doing a partial resolve to get rid of clear color at the "
"end of a renderpass due to an image/view format mismatch");
cmd_buffer_begin_subpass(cmd_buffer, 0);
}
-void genX(CmdBeginRenderPass2KHR)(
+void genX(CmdBeginRenderPass2)(
VkCommandBuffer commandBuffer,
const VkRenderPassBeginInfo* pRenderPassBeginInfo,
const VkSubpassBeginInfoKHR* pSubpassBeginInfo)
cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
}
-void genX(CmdNextSubpass2KHR)(
+void genX(CmdNextSubpass2)(
VkCommandBuffer commandBuffer,
const VkSubpassBeginInfoKHR* pSubpassBeginInfo,
const VkSubpassEndInfoKHR* pSubpassEndInfo)
cmd_buffer->state.subpass = NULL;
}
-void genX(CmdEndRenderPass2KHR)(
+void genX(CmdEndRenderPass2)(
VkCommandBuffer commandBuffer,
const VkSubpassEndInfoKHR* pSubpassEndInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_event, event, _event);
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
pc.StallAtPixelScoreboard = true;
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_event, event, _event);
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
pc.StallAtPixelScoreboard = true;