X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fintel%2Fvulkan%2FgenX_cmd_buffer.c;h=6e42b659f4c89532ad6660b441ff0a6f51a8c80d;hb=e63c662c26a6abfab5abf03a1646a236d6d730c0;hp=6fb19661ebba5d0d39409dcaeb8ec1fb45173190;hpb=643248b66a265242ef3de3dffdae79e1283eee31;p=mesa.git diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 6fb19661ebb..b2b292c1ff3 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -27,19 +27,21 @@ #include "anv_private.h" #include "vk_format_info.h" #include "vk_util.h" +#include "util/fast_idiv_by_const.h" +#include "common/gen_aux_map.h" #include "common/gen_l3_config.h" #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" -static void -emit_lrm(struct anv_batch *batch, uint32_t reg, struct anv_address addr) -{ - anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = reg; - lrm.MemoryAddress = addr; - } -} +/* We reserve GPR 14 and 15 for conditional rendering */ +#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14 +#define __gen_get_batch_dwords anv_batch_emit_dwords +#define __gen_address_offset anv_address_add +#include "common/gen_mi_builder.h" + +static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, + uint32_t pipeline); static void emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm) @@ -50,21 +52,12 @@ emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm) } } -#if GEN_IS_HASWELL || GEN_GEN >= 8 -static void -emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src) -{ - anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { - lrr.SourceRegisterAddress = src; - lrr.DestinationRegisterAddress = dst; - } -} -#endif - void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) { struct anv_device *device = cmd_buffer->device; + UNUSED const struct gen_device_info *devinfo = &device->info; + uint32_t mocs = device->isl_dev.mocs.internal; /* If we are emitting a new state base address we probably need to re-emit * binding tables. @@ -82,30 +75,56 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) pc.DCFlushEnable = true; pc.RenderTargetCacheFlushEnable = true; pc.CommandStreamerStallEnable = true; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; +#endif +#if GEN_GEN == 12 + /* GEN:BUG:1606662791: + * + * Software must program PIPE_CONTROL command with "HDC Pipeline + * Flush" prior to programming of the below two non-pipeline state : + * * STATE_BASE_ADDRESS + * * 3DSTATE_BINDING_TABLE_POOL_ALLOC + */ + if (devinfo->revision == 0 /* A0 */) + pc.HDCPipelineFlushEnable = true; +#endif } +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline + * mode by putting the pipeline temporarily in 3D mode. 
+ */ + uint32_t gen12_wa_pipeline = cmd_buffer->state.current_pipeline; + genX(flush_pipeline_select_3d)(cmd_buffer); +#endif + anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) { sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.GeneralStateMOCS = GENX(MOCS); + sba.GeneralStateMOCS = mocs; sba.GeneralStateBaseAddressModifyEnable = true; + sba.StatelessDataPortAccessMOCS = mocs; + sba.SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer); - sba.SurfaceStateMOCS = GENX(MOCS); + sba.SurfaceStateMOCS = mocs; sba.SurfaceStateBaseAddressModifyEnable = true; sba.DynamicStateBaseAddress = (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 }; - sba.DynamicStateMOCS = GENX(MOCS); + sba.DynamicStateMOCS = mocs; sba.DynamicStateBaseAddressModifyEnable = true; sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 }; - sba.IndirectObjectMOCS = GENX(MOCS); + sba.IndirectObjectMOCS = mocs; sba.IndirectObjectBaseAddressModifyEnable = true; sba.InstructionBaseAddress = (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 }; - sba.InstructionMOCS = GENX(MOCS); + sba.InstructionMOCS = mocs; sba.InstructionBaseAddressModifyEnable = true; # if (GEN_GEN >= 8) @@ -113,29 +132,71 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) * these fields. However, since we will be growing the BO's live, we * just set them all to the maximum. */ - sba.GeneralStateBufferSize = 0xfffff; + sba.GeneralStateBufferSize = 0xfffff; + sba.IndirectObjectBufferSize = 0xfffff; + if (device->physical->use_softpin) { + /* With softpin, we use fixed addresses so we actually know how big + * our base addresses are. + */ + sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096; + sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096; + } else { + sba.DynamicStateBufferSize = 0xfffff; + sba.InstructionBufferSize = 0xfffff; + } sba.GeneralStateBufferSizeModifyEnable = true; - sba.DynamicStateBufferSize = 0xfffff; - sba.DynamicStateBufferSizeModifyEnable = true; - sba.IndirectObjectBufferSize = 0xfffff; sba.IndirectObjectBufferSizeModifyEnable = true; - sba.InstructionBufferSize = 0xfffff; + sba.DynamicStateBufferSizeModifyEnable = true; sba.InstructionBuffersizeModifyEnable = true; +# else + /* On gen7, we have upper bounds instead. According to the docs, + * setting an upper bound of zero means that no bounds checking is + * performed so, in theory, we should be able to leave them zero. + * However, border color is broken and the GPU bounds-checks anyway. + * To avoid this and other potential problems, we may as well set it + * for everything. 
+ */ + sba.GeneralStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.GeneralStateAccessUpperBoundModifyEnable = true; + sba.DynamicStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.DynamicStateAccessUpperBoundModifyEnable = true; + sba.InstructionAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.InstructionAccessUpperBoundModifyEnable = true; # endif # if (GEN_GEN >= 9) - sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.BindlessSurfaceStateMOCS = GENX(MOCS); + if (cmd_buffer->device->physical->use_softpin) { + sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { + .bo = device->surface_state_pool.block_pool.bo, + .offset = 0, + }; + sba.BindlessSurfaceStateSize = (1 << 20) - 1; + } else { + sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS; + sba.BindlessSurfaceStateSize = 0; + } + sba.BindlessSurfaceStateMOCS = mocs; sba.BindlessSurfaceStateBaseAddressModifyEnable = true; - sba.BindlessSurfaceStateSize = 0; # endif # if (GEN_GEN >= 10) sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.BindlessSamplerStateMOCS = GENX(MOCS); + sba.BindlessSamplerStateMOCS = mocs; sba.BindlessSamplerStateBaseAddressModifyEnable = true; sba.BindlessSamplerStateBufferSize = 0; # endif } +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Put the pipeline back into its current mode. + */ + if (gen12_wa_pipeline != UINT32_MAX) + genX(flush_pipeline_select)(cmd_buffer, gen12_wa_pipeline); +#endif + /* After re-setting the surface state base address, we have to do some * cache flusing so that the sampler engine will pick up the new * SURFACE_STATE objects and binding tables. From the Broadwell PRM, @@ -189,7 +250,7 @@ add_surface_reloc(struct anv_cmd_buffer *cmd_buffer, VkResult result = anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, state.offset + isl_dev->ss.addr_offset, - addr.bo, addr.offset); + addr.bo, addr.offset, NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -208,7 +269,9 @@ add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer, anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, state.state.offset + isl_dev->ss.aux_addr_offset, - state.aux_address.bo, state.aux_address.offset); + state.aux_address.bo, + state.aux_address.offset, + NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -219,232 +282,269 @@ add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer, &cmd_buffer->pool->alloc, state.state.offset + isl_dev->ss.clear_color_state_offset, - state.clear_address.bo, state.clear_address.offset); + state.clear_address.bo, + state.clear_address.offset, + NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } } -static void -color_attachment_compute_aux_usage(struct anv_device * device, - struct anv_cmd_state * cmd_state, - uint32_t att, VkRect2D render_area, - union isl_color_value *fast_clear_color) +static bool +isl_color_value_requires_conversion(union isl_color_value color, + const struct isl_surf *surf, + const struct isl_view *view) { - struct anv_attachment_state *att_state = &cmd_state->attachments[att]; - struct anv_image_view *iview = cmd_state->framebuffer->attachments[att]; + if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle)) + return false; + + uint32_t surf_pack[4] = { 0, 0, 0, 0 }; + isl_color_value_pack(&color, surf->format, 
surf_pack); + + uint32_t view_pack[4] = { 0, 0, 0, 0 }; + union isl_color_value swiz_color = + isl_color_value_swizzle_inv(color, view->swizzle); + isl_color_value_pack(&swiz_color, view->format, view_pack); - assert(iview->n_planes == 1); + return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0; +} +static bool +anv_can_fast_clear_color_view(struct anv_device * device, + struct anv_image_view *iview, + VkImageLayout layout, + union isl_color_value clear_color, + uint32_t num_layers, + VkRect2D render_area) +{ if (iview->planes[0].isl.base_array_layer >= anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT, - iview->planes[0].isl.base_level)) { - /* There is no aux buffer which corresponds to the level and layer(s) - * being accessed. - */ - att_state->aux_usage = ISL_AUX_USAGE_NONE; - att_state->input_aux_usage = ISL_AUX_USAGE_NONE; - att_state->fast_clear = false; - return; - } - - att_state->aux_usage = - anv_layout_to_aux_usage(&device->info, iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); + iview->planes[0].isl.base_level)) + return false; - /* If we don't have aux, then we should have returned early in the layer - * check above. If we got here, we must have something. + /* Start by getting the fast clear type. We use the first subpass + * layout here because we don't want to fast-clear if the first subpass + * to use the attachment can't handle fast-clears. */ - assert(att_state->aux_usage != ISL_AUX_USAGE_NONE); - - if (att_state->aux_usage == ISL_AUX_USAGE_CCS_E || - att_state->aux_usage == ISL_AUX_USAGE_MCS) { - att_state->input_aux_usage = att_state->aux_usage; - } else { - /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode: - * - * "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D - * setting is only allowed if Surface Format supported for Fast - * Clear. In addition, if the surface is bound to the sampling - * engine, Surface Format must be supported for Render Target - * Compression for surfaces bound to the sampling engine." - * - * In other words, we can only sample from a fast-cleared image if it - * also supports color compression. - */ - if (isl_format_supports_ccs_e(&device->info, iview->planes[0].isl.format)) { - att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D; - - /* While fast-clear resolves and partial resolves are fairly cheap in the - * case where you render to most of the pixels, full resolves are not - * because they potentially involve reading and writing the entire - * framebuffer. If we can't texture with CCS_E, we should leave it off and - * limit ourselves to fast clears. 
- */ - if (cmd_state->pass->attachments[att].first_subpass_layout == - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { - anv_perf_warn(device->instance, iview->image, - "Not temporarily enabling CCS_E."); - } - } else { - att_state->input_aux_usage = ISL_AUX_USAGE_NONE; - } + enum anv_fast_clear_type fast_clear_type = + anv_layout_to_fast_clear_type(&device->info, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + layout); + switch (fast_clear_type) { + case ANV_FAST_CLEAR_NONE: + return false; + case ANV_FAST_CLEAR_DEFAULT_VALUE: + if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format)) + return false; + break; + case ANV_FAST_CLEAR_ANY: + break; } - assert(iview->image->planes[0].aux_surface.isl.usage & - (ISL_SURF_USAGE_CCS_BIT | ISL_SURF_USAGE_MCS_BIT)); - - union isl_color_value clear_color = {}; - anv_clear_color_from_att_state(&clear_color, att_state, iview); - - att_state->clear_color_is_zero_one = - isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format); - att_state->clear_color_is_zero = - isl_color_value_is_zero(clear_color, iview->planes[0].isl.format); - - if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { - /* Start by getting the fast clear type. We use the first subpass - * layout here because we don't want to fast-clear if the first subpass - * to use the attachment can't handle fast-clears. - */ - enum anv_fast_clear_type fast_clear_type = - anv_layout_to_fast_clear_type(&device->info, iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, - cmd_state->pass->attachments[att].first_subpass_layout); - switch (fast_clear_type) { - case ANV_FAST_CLEAR_NONE: - att_state->fast_clear = false; - break; - case ANV_FAST_CLEAR_DEFAULT_VALUE: - att_state->fast_clear = att_state->clear_color_is_zero; - break; - case ANV_FAST_CLEAR_ANY: - att_state->fast_clear = true; - break; - } + /* Potentially, we could do partial fast-clears but doing so has crazy + * alignment restrictions. It's easier to just restrict to full size + * fast clears for now. + */ + if (render_area.offset.x != 0 || + render_area.offset.y != 0 || + render_area.extent.width != iview->extent.width || + render_area.extent.height != iview->extent.height) + return false; - /* Potentially, we could do partial fast-clears but doing so has crazy - * alignment restrictions. It's easier to just restrict to full size - * fast clears for now. - */ - if (render_area.offset.x != 0 || - render_area.offset.y != 0 || - render_area.extent.width != iview->extent.width || - render_area.extent.height != iview->extent.height) - att_state->fast_clear = false; + /* On Broadwell and earlier, we can only handle 0/1 clear colors */ + if (GEN_GEN <= 8 && + !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format)) + return false; - /* On Broadwell and earlier, we can only handle 0/1 clear colors */ - if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one) - att_state->fast_clear = false; + /* If the clear color is one that would require non-trivial format + * conversion on resolve, we don't bother with the fast clear. This + * shouldn't be common as most clear colors are 0/1 and the most common + * format re-interpretation is for sRGB. 
+ */ + if (isl_color_value_requires_conversion(clear_color, + &iview->image->planes[0].surface.isl, + &iview->planes[0].isl)) { + anv_perf_warn(device, iview, + "Cannot fast-clear to colors which would require " + "format conversion on resolve"); + return false; + } - /* We only allow fast clears to the first slice of an image (level 0, - * layer 0) and only for the entire slice. This guarantees us that, at - * any given time, there is only one clear color on any given image at - * any given time. At the time of our testing (Jan 17, 2018), there - * were no known applications which would benefit from fast-clearing - * more than just the first slice. - */ - if (att_state->fast_clear && - (iview->planes[0].isl.base_level > 0 || - iview->planes[0].isl.base_array_layer > 0)) { - anv_perf_warn(device->instance, iview->image, - "Rendering with multi-lod or multi-layer framebuffer " - "with LOAD_OP_LOAD and baseMipLevel > 0 or " - "baseArrayLayer > 0. Not fast clearing."); - att_state->fast_clear = false; - } else if (att_state->fast_clear && cmd_state->framebuffer->layers > 1) { - anv_perf_warn(device->instance, iview->image, - "Rendering to a multi-layer framebuffer with " - "LOAD_OP_CLEAR. Only fast-clearing the first slice"); - } + /* We only allow fast clears to the first slice of an image (level 0, + * layer 0) and only for the entire slice. This guarantees us that, at + * any given time, there is only one clear color on any given image at + * any given time. At the time of our testing (Jan 17, 2018), there + * were no known applications which would benefit from fast-clearing + * more than just the first slice. + */ + if (iview->planes[0].isl.base_level > 0 || + iview->planes[0].isl.base_array_layer > 0) { + anv_perf_warn(device, iview->image, + "Rendering with multi-lod or multi-layer framebuffer " + "with LOAD_OP_LOAD and baseMipLevel > 0 or " + "baseArrayLayer > 0. Not fast clearing."); + return false; + } - if (att_state->fast_clear) - *fast_clear_color = clear_color; - } else { - att_state->fast_clear = false; + if (num_layers > 1) { + anv_perf_warn(device, iview->image, + "Rendering to a multi-layer framebuffer with " + "LOAD_OP_CLEAR. Only fast-clearing the first slice"); } + + return true; } -static void -depth_stencil_attachment_compute_aux_usage(struct anv_device *device, - struct anv_cmd_state *cmd_state, - uint32_t att, VkRect2D render_area) +static bool +anv_can_hiz_clear_ds_view(struct anv_device *device, + struct anv_image_view *iview, + VkImageLayout layout, + VkImageAspectFlags clear_aspects, + float depth_clear_value, + VkRect2D render_area) { - struct anv_render_pass_attachment *pass_att = - &cmd_state->pass->attachments[att]; - struct anv_attachment_state *att_state = &cmd_state->attachments[att]; - struct anv_image_view *iview = cmd_state->framebuffer->attachments[att]; - - /* These will be initialized after the first subpass transition. 
*/ - att_state->aux_usage = ISL_AUX_USAGE_NONE; - att_state->input_aux_usage = ISL_AUX_USAGE_NONE; - - if (GEN_GEN == 7) { - /* We don't do any HiZ or depth fast-clears on gen7 yet */ - att_state->fast_clear = false; - return; - } - - if (!(att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { - /* If we're just clearing stencil, we can always HiZ clear */ - att_state->fast_clear = true; - return; - } + /* We don't do any HiZ or depth fast-clears on gen7 yet */ + if (GEN_GEN == 7) + return false; - /* Default to false for now */ - att_state->fast_clear = false; + /* If we're just clearing stencil, we can always HiZ clear */ + if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) + return true; /* We must have depth in order to have HiZ */ if (!(iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) - return; + return false; - const enum isl_aux_usage first_subpass_aux_usage = + const enum isl_aux_usage clear_aux_usage = anv_layout_to_aux_usage(&device->info, iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, - pass_att->first_subpass_layout); - if (first_subpass_aux_usage != ISL_AUX_USAGE_HIZ) - return; - - if (!blorp_can_hiz_clear_depth(GEN_GEN, - iview->planes[0].isl.format, - iview->image->samples, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + layout); + if (!blorp_can_hiz_clear_depth(&device->info, + &iview->image->planes[0].surface.isl, + clear_aux_usage, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, render_area.offset.x, render_area.offset.y, render_area.offset.x + render_area.extent.width, render_area.offset.y + render_area.extent.height)) - return; + return false; - if (att_state->clear_value.depthStencil.depth != ANV_HZ_FC_VAL) - return; + if (depth_clear_value != ANV_HZ_FC_VAL) + return false; - if (GEN_GEN == 8 && anv_can_sample_with_hiz(&device->info, iview->image)) { - /* Only gen9+ supports returning ANV_HZ_FC_VAL when sampling a - * fast-cleared portion of a HiZ buffer. Testing has revealed that Gen8 - * only supports returning 0.0f. Gens prior to gen8 do not support this - * feature at all. - */ - return; - } + /* Only gen9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared + * portion of a HiZ buffer. Testing has revealed that Gen8 only supports + * returning 0.0f. Gens prior to gen8 do not support this feature at all. + */ + if (GEN_GEN == 8 && anv_can_sample_with_hiz(&device->info, iview->image)) + return false; /* If we got here, then we can fast clear */ - att_state->fast_clear = true; + return true; } -static bool -need_input_attachment_state(const struct anv_render_pass_attachment *att) +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + +#if GEN_GEN == 12 +static void +anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count) { - if (!(att->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) - return false; + uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); + + uint64_t base_address = + anv_address_physical(image->planes[plane].address); + + const struct isl_surf *isl_surf = &image->planes[plane].surface.isl; + uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf); - /* We only allocate input attachment states for color surfaces. Compression - * is not yet enabled for depth textures and stencil doesn't allow - * compression so we can just use the texture surface state from the view. + /* We're about to live-update the AUX-TT. 
We really don't want anyone else + * trying to read it while we're doing this. We could probably get away + * with not having this stall in some cases if we were really careful but + * it's better to play it safe. Full stall the GPU. */ - return vk_format_is_color(att->format); + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + + uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0; + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + uint32_t logical_array_layer, logical_z_offset_px; + if (image->type == VK_IMAGE_TYPE_3D) { + logical_array_layer = 0; + + /* If the given miplevel does not have this layer, then any higher + * miplevels won't either because miplevels only get smaller the + * higher the LOD. + */ + assert(layer < image->extent.depth); + if (layer >= anv_minify(image->extent.depth, level)) + break; + logical_z_offset_px = layer; + } else { + assert(layer < image->array_size); + logical_array_layer = layer; + logical_z_offset_px = 0; + } + + uint32_t slice_start_offset_B, slice_end_offset_B; + isl_surf_get_image_range_B_tile(isl_surf, level, + logical_array_layer, + logical_z_offset_px, + &slice_start_offset_B, + &slice_end_offset_B); + + start_offset_B = MIN2(start_offset_B, slice_start_offset_B); + end_offset_B = MAX2(end_offset_B, slice_end_offset_B); + } + + /* Aux operates 64K at a time */ + start_offset_B = align_down_u64(start_offset_B, 64 * 1024); + end_offset_B = align_u64(end_offset_B, 64 * 1024); + + for (uint64_t offset = start_offset_B; + offset < end_offset_B; offset += 64 * 1024) { + uint64_t address = base_address + offset; + + uint64_t aux_entry_addr64, *aux_entry_map; + aux_entry_map = gen_aux_map_get_entry(cmd_buffer->device->aux_map_ctx, + address, &aux_entry_addr64); + + assert(cmd_buffer->device->physical->use_softpin); + struct anv_address aux_entry_address = { + .bo = NULL, + .offset = aux_entry_addr64, + }; + + const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map); + uint64_t new_aux_entry = + (old_aux_entry & GEN_AUX_MAP_ADDRESS_MASK) | format_bits; + + if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) + new_aux_entry |= GEN_AUX_MAP_ENTRY_VALID_BIT; + + gen_mi_store(&b, gen_mi_mem64(aux_entry_address), + gen_mi_imm(new_aux_entry)); + } + } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT; } +#endif /* GEN_GEN == 12 */ /* Transitions a HiZ-enabled depth buffer from one layout to another. 
Unless * the initial layout is undefined, the HiZ buffer and depth buffer will @@ -453,34 +553,114 @@ need_input_attachment_state(const struct anv_render_pass_attachment *att) static void transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + uint32_t base_layer, uint32_t layer_count, VkImageLayout initial_layout, VkImageLayout final_layout) { - const bool hiz_enabled = ISL_AUX_USAGE_HIZ == - anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, initial_layout); - const bool enable_hiz = ISL_AUX_USAGE_HIZ == - anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, final_layout); - - enum isl_aux_op hiz_op; - if (hiz_enabled && !enable_hiz) { - hiz_op = ISL_AUX_OP_FULL_RESOLVE; - } else if (!hiz_enabled && enable_hiz) { - hiz_op = ISL_AUX_OP_AMBIGUATE; - } else { - assert(hiz_enabled == enable_hiz); - /* If the same buffer will be used, no resolves are necessary. */ - hiz_op = ISL_AUX_OP_NONE; + uint32_t depth_plane = + anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT); + if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE) + return; + +#if GEN_GEN == 12 + if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || + initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) && + cmd_buffer->device->physical->has_implicit_ccs && + cmd_buffer->device->info.has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, 1, 0, 1); } +#endif + + const enum isl_aux_state initial_state = + anv_layout_to_aux_state(&cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + initial_layout); + const enum isl_aux_state final_state = + anv_layout_to_aux_state(&cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + final_layout); + + const bool initial_depth_valid = + isl_aux_state_has_valid_primary(initial_state); + const bool initial_hiz_valid = + isl_aux_state_has_valid_aux(initial_state); + const bool final_needs_depth = + isl_aux_state_has_valid_primary(final_state); + const bool final_needs_hiz = + isl_aux_state_has_valid_aux(final_state); + + /* Getting into the pass-through state for Depth is tricky and involves + * both a resolve and an ambiguate. We don't handle that state right now + * as anv_layout_to_aux_state never returns it. + */ + assert(final_state != ISL_AUX_STATE_PASS_THROUGH); - if (hiz_op != ISL_AUX_OP_NONE) + if (final_needs_depth && !initial_depth_valid) { + assert(initial_hiz_valid); anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - 0, 0, 1, hiz_op); + 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE); + } else if (final_needs_hiz && !initial_hiz_valid) { + assert(initial_depth_valid); + anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE); + } +} + +static inline bool +vk_image_layout_stencil_write_optimal(VkImageLayout layout) +{ + return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR; } -#define MI_PREDICATE_SRC0 0x2400 -#define MI_PREDICATE_SRC1 0x2408 +/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless + * the initial layout is undefined, the HiZ buffer and depth buffer will + * represent the same data at the end of this operation. 
+ */ +static void +transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout) +{ +#if GEN_GEN == 7 + uint32_t plane = anv_image_aspect_to_plane(image->aspects, + VK_IMAGE_ASPECT_STENCIL_BIT); + + /* On gen7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in following + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. + */ + if (image->planes[plane].shadow_surface.isl.size_B > 0 && + vk_image_layout_stencil_write_optimal(initial_layout) && + !vk_image_layout_stencil_write_optimal(final_layout)) { + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + base_level, level_count, + base_layer, layer_count); + } +#endif /* GEN_GEN == 7 */ +} + +#define MI_PREDICATE_SRC0 0x2400 +#define MI_PREDICATE_SRC1 0x2408 +#define MI_PREDICATE_RESULT 0x2418 static void set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer, @@ -526,25 +706,6 @@ set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer, set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true); } -#if GEN_IS_HASWELL || GEN_GEN >= 8 -static inline uint32_t -mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2) -{ - struct GENX(MI_MATH_ALU_INSTRUCTION) instr = { - .ALUOpcode = opcode, - .Operand1 = operand1, - .Operand2 = operand2, - }; - - uint32_t dw; - GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr); - - return dw; -} -#endif - -#define CS_GPR(n) (0x2600 + (n) * 8) - /* This is only really practical on haswell and above because it requires * MI math in order to get it correct. */ @@ -557,15 +718,12 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, enum isl_aux_op resolve_op, enum anv_fast_clear_type fast_clear_supported) { - struct anv_address fast_clear_type_addr = - anv_image_get_fast_clear_type_addr(cmd_buffer->device, image, aspect); + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); - /* Name some registers */ - const int image_fc_reg = MI_ALU_REG0; - const int fc_imm_reg = MI_ALU_REG1; - const int pred_reg = MI_ALU_REG2; - - uint32_t *dw; + const struct gen_mi_value fast_clear_type = + gen_mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect)); if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) { /* In this case, we're doing a full resolve which means we want the @@ -576,17 +734,13 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, * if the first slice has been fast-cleared, it is also marked as * compressed. See also set_image_fast_clear_state. 
*/ - struct anv_address compression_state_addr = - anv_image_get_compression_state_addr(cmd_buffer->device, image, - aspect, level, array_layer); - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = MI_PREDICATE_SRC0; - lrm.MemoryAddress = compression_state_addr; - } - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { - sdi.Address = compression_state_addr; - sdi.ImmediateData = 0; - } + const struct gen_mi_value compression_state = + gen_mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device, + image, aspect, + level, array_layer)); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), + compression_state); + gen_mi_store(&b, compression_state, gen_mi_imm(0)); if (level == 0 && array_layer == 0) { /* If the predicate is true, we want to write 0 to the fast clear type @@ -594,25 +748,10 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, * * clear_type = clear_type & ~predicate; */ - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = CS_GPR(image_fc_reg); - lrm.MemoryAddress = fast_clear_type_addr; - } - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_REG), lrr) { - lrr.DestinationRegisterAddress = CS_GPR(pred_reg); - lrr.SourceRegisterAddress = MI_PREDICATE_SRC0; - } - - dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); - dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, image_fc_reg); - dw[2] = mi_alu(MI_ALU_LOADINV, MI_ALU_SRCB, pred_reg); - dw[3] = mi_alu(MI_ALU_AND, 0, 0); - dw[4] = mi_alu(MI_ALU_STORE, image_fc_reg, MI_ALU_ACCU); - - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.MemoryAddress = fast_clear_type_addr; - srm.RegisterAddress = CS_GPR(image_fc_reg); - } + struct gen_mi_value new_fast_clear_type = + gen_mi_iand(&b, fast_clear_type, + gen_mi_inot(&b, gen_mi_reg64(MI_PREDICATE_SRC0))); + gen_mi_store(&b, fast_clear_type, new_fast_clear_type); } } else if (level == 0 && array_layer == 0) { /* In this case, we are doing a partial resolve to get rid of fast-clear @@ -622,42 +761,20 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); assert(fast_clear_supported < ANV_FAST_CLEAR_ANY); - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = CS_GPR(image_fc_reg); - lrm.MemoryAddress = fast_clear_type_addr; - } - emit_lri(&cmd_buffer->batch, CS_GPR(image_fc_reg) + 4, 0); - - emit_lri(&cmd_buffer->batch, CS_GPR(fc_imm_reg), fast_clear_supported); - emit_lri(&cmd_buffer->batch, CS_GPR(fc_imm_reg) + 4, 0); - - /* We need to compute (fast_clear_supported < image->fast_clear). - * We do this by subtracting and storing the carry bit. - */ - dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); - dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, fc_imm_reg); - dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, image_fc_reg); - dw[3] = mi_alu(MI_ALU_SUB, 0, 0); - dw[4] = mi_alu(MI_ALU_STORE, pred_reg, MI_ALU_CF); - - /* Store the predicate */ - emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(pred_reg)); + /* We need to compute (fast_clear_supported < image->fast_clear) */ + struct gen_mi_value pred = + gen_mi_ult(&b, gen_mi_imm(fast_clear_supported), fast_clear_type); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), + gen_mi_value_ref(&b, pred)); /* If the predicate is true, we want to write 0 to the fast clear type * and, if it's false, leave it alone. 
We can do this by writing * * clear_type = clear_type & ~predicate; */ - dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); - dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, image_fc_reg); - dw[2] = mi_alu(MI_ALU_LOADINV, MI_ALU_SRCB, pred_reg); - dw[3] = mi_alu(MI_ALU_AND, 0, 0); - dw[4] = mi_alu(MI_ALU_STORE, image_fc_reg, MI_ALU_ACCU); - - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.RegisterAddress = CS_GPR(image_fc_reg); - srm.MemoryAddress = fast_clear_type_addr; - } + struct gen_mi_value new_fast_clear_type = + gen_mi_iand(&b, fast_clear_type, gen_mi_inot(&b, pred)); + gen_mi_store(&b, fast_clear_type, new_fast_clear_type); } else { /* In this case, we're trying to do a partial resolve on a slice that * doesn't have clear color. There's nothing to do. @@ -666,13 +783,8 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, return; } - /* We use the first half of src0 for the actual predicate. Set the second - * half of src0 and all of src1 to 0 as the predicate operation will be - * doing an implicit src0 != src1. - */ - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 , 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0); + /* Set src1 to 0 and use a != condition */ + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0)); anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOADINV; @@ -691,8 +803,12 @@ anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, enum isl_aux_op resolve_op, enum anv_fast_clear_type fast_clear_supported) { - struct anv_address fast_clear_type_addr = - anv_image_get_fast_clear_type_addr(cmd_buffer->device, image, aspect); + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + struct gen_mi_value fast_clear_type_mem = + gen_mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect)); /* This only works for partial resolves and only when the clear color is * all or nothing. On the upside, this emits less command streamer code @@ -709,22 +825,9 @@ anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, * can't sample from CCS surfaces. It's enough to just load the fast clear * state into the predicate register. */ - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = MI_PREDICATE_SRC0; - lrm.MemoryAddress = fast_clear_type_addr; - } - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { - sdi.Address = fast_clear_type_addr; - sdi.ImmediateData = 0; - } - - /* We use the first half of src0 for the actual predicate. Set the second - * half of src0 and all of src1 to 0 as the predicate operation will be - * doing an implicit src0 != src1. 
- */ - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 , 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0)); + gen_mi_store(&b, fast_clear_type_mem, gen_mi_imm(0)); anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOADINV; @@ -738,6 +841,7 @@ static void anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, enum isl_format format, + struct isl_swizzle swizzle, VkImageAspectFlagBits aspect, uint32_t level, uint32_t array_layer, enum isl_aux_op resolve_op, @@ -759,17 +863,18 @@ anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, * to do a partial resolve on a CCS_D surface. */ if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && - image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) resolve_op = ISL_AUX_OP_FULL_RESOLVE; - anv_image_ccs_op(cmd_buffer, image, format, aspect, level, - array_layer, 1, resolve_op, NULL, true); + anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect, + level, array_layer, 1, resolve_op, NULL, true); } static void anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, enum isl_format format, + struct isl_swizzle swizzle, VkImageAspectFlagBits aspect, uint32_t array_layer, enum isl_aux_op resolve_op, @@ -783,7 +888,7 @@ anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, aspect, 0, array_layer, resolve_op, fast_clear_supported); - anv_image_mcs_op(cmd_buffer, image, format, aspect, + anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect, array_layer, 1, resolve_op, NULL, true); #else unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail"); @@ -826,27 +931,21 @@ init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer, set_image_fast_clear_state(cmd_buffer, image, aspect, ANV_FAST_CLEAR_NONE); - /* The fast clear value dword(s) will be copied into a surface state object. - * Ensure that the restrictions of the fields in the dword(s) are followed. - * - * CCS buffers on SKL+ can have any value set for the clear colors. - */ - if (image->samples == 1 && GEN_GEN >= 9) - return; - - /* Other combinations of auxiliary buffers and platforms require specific - * values in the clear value dword(s). + /* Initialize the struct fields that are accessed for fast-clears so that + * the HW restrictions on the field values are satisfied. */ struct anv_address addr = anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); if (GEN_GEN >= 9) { - for (unsigned i = 0; i < 4; i++) { + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + const unsigned num_dwords = GEN_GEN >= 10 ? + isl_dev->ss.clear_color_state_size / 4 : + isl_dev->ss.clear_value_size / 4; + for (unsigned i = 0; i < num_dwords; i++) { anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { sdi.Address = addr; sdi.Address.offset += i * 4; - /* MCS buffers on SKL+ can only have 1/0 clear colors. 
*/ - assert(image->samples > 1); sdi.ImmediateData = 0; } } @@ -894,12 +993,35 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; +#if GEN_GEN == 7 + /* On gen7, the combination of commands used here(MI_LOAD_REGISTER_MEM + * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is + * in-flight when they are issued even if the memory touched is not + * currently active for rendering. The weird bit is that it is not the + * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight + * rendering hangs such that the next stalling command after the + * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang. + * + * It is unclear exactly why this hang occurs. Both MI commands come with + * warnings about the 3D pipeline but that doesn't seem to fully explain + * it. My (Jason's) best theory is that it has something to do with the + * fact that we're using a GPU state register as our temporary and that + * something with reading/writing it is causing problems. + * + * In order to work around this issue, we emit a PIPE_CONTROL with the + * command streamer stall bit set. + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +#endif + + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + if (copy_from_surface_state) { - genX(cmd_buffer_mi_memcpy)(cmd_buffer, entry_addr, - ss_clear_addr, copy_size); + gen_mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size); } else { - genX(cmd_buffer_mi_memcpy)(cmd_buffer, ss_clear_addr, - entry_addr, copy_size); + gen_mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size); /* Updating a surface state object may require that the state cache be * invalidated. From the SKL PRM, Shared Functions -> State -> State @@ -937,7 +1059,8 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, VkImageLayout initial_layout, VkImageLayout final_layout) { - const struct gen_device_info *devinfo = &cmd_buffer->device->info; + struct anv_device *device = cmd_buffer->device; + const struct gen_device_info *devinfo = &device->info; /* Validate the inputs. */ assert(cmd_buffer); assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); @@ -945,7 +1068,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, assert(level_count != VK_REMAINING_MIP_LEVELS && layer_count != VK_REMAINING_ARRAY_LAYERS); /* Ensure the subresource range is valid. 
*/ - uint64_t last_level_num = base_level + level_count; + UNUSED uint64_t last_level_num = base_level + level_count; const uint32_t max_depth = anv_minify(image->extent.depth, base_level); UNUSED const uint32_t image_layers = MAX2(image->array_size, max_depth); assert((uint64_t)base_layer + layer_count <= image_layers); @@ -974,6 +1097,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, assert(isl_format_is_compressed(image->planes[plane].surface.isl.format)); assert(plane == 0); anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_COLOR_BIT, base_level, level_count, base_layer, layer_count); } @@ -981,10 +1105,20 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, if (base_layer >= anv_image_aux_layers(image, aspect, base_level)) return; - assert(image->tiling == VK_IMAGE_TILING_OPTIMAL); + assert(image->planes[plane].surface.isl.tiling != ISL_TILING_LINEAR); if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) { +#if GEN_GEN == 12 + if (device->physical->has_implicit_ccs && devinfo->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, aspect, + base_level, level_count, + base_layer, layer_count); + } +#else + assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map)); +#endif + /* A subresource in the undefined layout may have been aliased and * populated with any arrangement of bits. Therefore, we must initialize * the related aux buffer and clear buffer entry with desirable values. @@ -1041,6 +1175,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, anv_image_ccs_op(cmd_buffer, image, image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, aspect, level, base_layer, level_layer_count, ISL_AUX_OP_AMBIGUATE, NULL, false); @@ -1052,7 +1187,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, } } else { if (image->samples == 4 || image->samples == 16) { - anv_perf_warn(cmd_buffer->device->instance, image, + anv_perf_warn(cmd_buffer->device, image, "Doing a potentially unnecessary fast-clear to " "define an MCS buffer."); } @@ -1060,6 +1195,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, assert(base_level == 0 && level_count == 1); anv_image_mcs_op(cmd_buffer, image, image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, aspect, base_layer, layer_count, ISL_AUX_OP_FAST_CLEAR, NULL, false); } @@ -1067,9 +1203,9 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, } const enum isl_aux_usage initial_aux_usage = - anv_layout_to_aux_usage(devinfo, image, aspect, initial_layout); + anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout); const enum isl_aux_usage final_aux_usage = - anv_layout_to_aux_usage(devinfo, image, aspect, final_layout); + anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout); /* The current code assumes that there is no mixing of CCS_E and CCS_D. * We can handle transitions between CCS_D/E to and from NONE. What we @@ -1123,7 +1259,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, * we do any more rendering or clearing. 
*/ cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; for (uint32_t l = 0; l < level_count; l++) { uint32_t level = base_level + l; @@ -1139,6 +1275,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, if (image->samples == 1) { anv_cmd_predicated_ccs_resolve(cmd_buffer, image, image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, aspect, level, array_layer, resolve_op, final_fast_clear); } else { @@ -1152,6 +1289,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, anv_cmd_predicated_mcs_resolve(cmd_buffer, image, image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, aspect, array_layer, resolve_op, final_fast_clear); } @@ -1159,27 +1297,24 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, } cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; } -/** - * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass. - */ static VkResult genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, - struct anv_render_pass *pass, + const struct anv_render_pass *pass, + const struct anv_framebuffer *framebuffer, const VkRenderPassBeginInfo *begin) { - const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; struct anv_cmd_state *state = &cmd_buffer->state; vk_free(&cmd_buffer->pool->alloc, state->attachments); if (pass->attachment_count > 0) { - state->attachments = vk_alloc(&cmd_buffer->pool->alloc, - pass->attachment_count * - sizeof(state->attachments[0]), - 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + state->attachments = vk_zalloc(&cmd_buffer->pool->alloc, + pass->attachment_count * + sizeof(state->attachments[0]), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (state->attachments == NULL) { /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */ return anv_batch_set_error(&cmd_buffer->batch, @@ -1189,135 +1324,89 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, state->attachments = NULL; } - /* Reserve one for the NULL state. 
*/ - unsigned num_states = 1; - for (uint32_t i = 0; i < pass->attachment_count; ++i) { - if (vk_format_is_color(pass->attachments[i].format)) - num_states++; + const VkRenderPassAttachmentBeginInfoKHR *attach_begin = + vk_find_struct_const(begin, RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR); + if (begin && !attach_begin) + assert(pass->attachment_count == framebuffer->attachment_count); - if (need_input_attachment_state(&pass->attachments[i])) - num_states++; + for (uint32_t i = 0; i < pass->attachment_count; ++i) { + if (attach_begin && attach_begin->attachmentCount != 0) { + assert(attach_begin->attachmentCount == pass->attachment_count); + ANV_FROM_HANDLE(anv_image_view, iview, attach_begin->pAttachments[i]); + state->attachments[i].image_view = iview; + } else if (framebuffer && i < framebuffer->attachment_count) { + state->attachments[i].image_view = framebuffer->attachments[i]; + } else { + state->attachments[i].image_view = NULL; + } } - const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align); - state->render_pass_states = - anv_state_stream_alloc(&cmd_buffer->surface_state_stream, - num_states * ss_stride, isl_dev->ss.align); - - struct anv_state next_state = state->render_pass_states; - next_state.alloc_size = isl_dev->ss.size; - - state->null_surface_state = next_state; - next_state.offset += ss_stride; - next_state.map += ss_stride; - - for (uint32_t i = 0; i < pass->attachment_count; ++i) { - if (vk_format_is_color(pass->attachments[i].format)) { - state->attachments[i].color.state = next_state; - next_state.offset += ss_stride; - next_state.map += ss_stride; - } - - if (need_input_attachment_state(&pass->attachments[i])) { - state->attachments[i].input.state = next_state; - next_state.offset += ss_stride; - next_state.map += ss_stride; - } - } - assert(next_state.offset == state->render_pass_states.offset + - state->render_pass_states.alloc_size); - - if (begin) { - ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer); - assert(pass->attachment_count == framebuffer->attachment_count); - - isl_null_fill_state(isl_dev, state->null_surface_state.map, - isl_extent3d(framebuffer->width, - framebuffer->height, - framebuffer->layers)); - - for (uint32_t i = 0; i < pass->attachment_count; ++i) { - struct anv_render_pass_attachment *att = &pass->attachments[i]; - VkImageAspectFlags att_aspects = vk_format_aspects(att->format); - VkImageAspectFlags clear_aspects = 0; - VkImageAspectFlags load_aspects = 0; + if (begin) { + for (uint32_t i = 0; i < pass->attachment_count; ++i) { + const struct anv_render_pass_attachment *pass_att = &pass->attachments[i]; + struct anv_attachment_state *att_state = &state->attachments[i]; + VkImageAspectFlags att_aspects = vk_format_aspects(pass_att->format); + VkImageAspectFlags clear_aspects = 0; + VkImageAspectFlags load_aspects = 0; if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { /* color attachment */ - if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; - } else if (att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { + } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { load_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; } } else { /* depthstencil attachment */ if (att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { - if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; - } else if (att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { + } 
else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { load_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; } } if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { - if (att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; - } else if (att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { + } else if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; } } } - state->attachments[i].current_layout = att->initial_layout; - state->attachments[i].pending_clear_aspects = clear_aspects; - state->attachments[i].pending_load_aspects = load_aspects; + att_state->current_layout = pass_att->initial_layout; + att_state->current_stencil_layout = pass_att->stencil_initial_layout; + att_state->pending_clear_aspects = clear_aspects; + att_state->pending_load_aspects = load_aspects; if (clear_aspects) - state->attachments[i].clear_value = begin->pClearValues[i]; + att_state->clear_value = begin->pClearValues[i]; - struct anv_image_view *iview = framebuffer->attachments[i]; - anv_assert(iview->vk_format == att->format); + struct anv_image_view *iview = state->attachments[i].image_view; + anv_assert(iview->vk_format == pass_att->format); const uint32_t num_layers = iview->planes[0].isl.array_len; - state->attachments[i].pending_clear_views = (1 << num_layers) - 1; + att_state->pending_clear_views = (1 << num_layers) - 1; - union isl_color_value clear_color = { .u32 = { 0, } }; - if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { - anv_assert(iview->n_planes == 1); - assert(att_aspects == VK_IMAGE_ASPECT_COLOR_BIT); - color_attachment_compute_aux_usage(cmd_buffer->device, - state, i, begin->renderArea, - &clear_color); - - anv_image_fill_surface_state(cmd_buffer->device, - iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, - &iview->planes[0].isl, - ISL_SURF_USAGE_RENDER_TARGET_BIT, - state->attachments[i].aux_usage, - &clear_color, - 0, - &state->attachments[i].color, - NULL); - - add_surface_state_relocs(cmd_buffer, state->attachments[i].color); - } else { - depth_stencil_attachment_compute_aux_usage(cmd_buffer->device, - state, i, - begin->renderArea); - } + /* This will be initialized after the first subpass transition. 
*/ + att_state->aux_usage = ISL_AUX_USAGE_NONE; - if (need_input_attachment_state(&pass->attachments[i])) { - anv_image_fill_surface_state(cmd_buffer->device, - iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, - &iview->planes[0].isl, - ISL_SURF_USAGE_TEXTURE_BIT, - state->attachments[i].input_aux_usage, - &clear_color, - 0, - &state->attachments[i].input, - NULL); - - add_surface_state_relocs(cmd_buffer, state->attachments[i].input); + att_state->fast_clear = false; + if (clear_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + assert(clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); + att_state->fast_clear = + anv_can_fast_clear_color_view(cmd_buffer->device, iview, + pass_att->first_subpass_layout, + vk_to_isl_color(att_state->clear_value.color), + framebuffer->layers, + begin->renderArea); + } else if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)) { + att_state->fast_clear = + anv_can_hiz_clear_ds_view(cmd_buffer->device, iview, + pass_att->first_subpass_layout, + clear_aspects, + att_state->clear_value.depthStencil.depth, + begin->renderArea); } } } @@ -1325,6 +1414,82 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, return VK_SUCCESS; } +/** + * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass. + */ +static VkResult +genX(cmd_buffer_alloc_att_surf_states)(struct anv_cmd_buffer *cmd_buffer, + const struct anv_render_pass *pass, + const struct anv_subpass *subpass) +{ + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + struct anv_cmd_state *state = &cmd_buffer->state; + + /* Reserve one for the NULL state. */ + unsigned num_states = 1; + for (uint32_t i = 0; i < subpass->attachment_count; i++) { + uint32_t att = subpass->attachments[i].attachment; + if (att == VK_ATTACHMENT_UNUSED) + continue; + + assert(att < pass->attachment_count); + if (!vk_format_is_color(pass->attachments[att].format)) + continue; + + const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage; + assert(util_bitcount(att_usage) == 1); + + if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT || + att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) + num_states++; + } + + const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align); + state->attachment_states = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + num_states * ss_stride, isl_dev->ss.align); + if (state->attachment_states.map == NULL) { + return anv_batch_set_error(&cmd_buffer->batch, + VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + struct anv_state next_state = state->attachment_states; + next_state.alloc_size = isl_dev->ss.size; + + state->null_surface_state = next_state; + next_state.offset += ss_stride; + next_state.map += ss_stride; + + for (uint32_t i = 0; i < subpass->attachment_count; i++) { + uint32_t att = subpass->attachments[i].attachment; + if (att == VK_ATTACHMENT_UNUSED) + continue; + + assert(att < pass->attachment_count); + if (!vk_format_is_color(pass->attachments[att].format)) + continue; + + const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage; + assert(util_bitcount(att_usage) == 1); + + if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) + state->attachments[att].color.state = next_state; + else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) + state->attachments[att].input.state = next_state; + else + continue; + + state->attachments[att].color.state = next_state; + next_state.offset += ss_stride; + next_state.map += ss_stride; + } + + assert(next_state.offset == state->attachment_states.offset + + 
state->attachment_states.alloc_size); + + return VK_SUCCESS; +} + VkResult genX(BeginCommandBuffer)( VkCommandBuffer commandBuffer, @@ -1363,9 +1528,18 @@ genX(BeginCommandBuffer)( * executing anything. The chances are fairly high that they will use * blorp at least once per primary command buffer so it shouldn't be * wasted. + * + * There is also a workaround on gen8 which requires us to invalidate the + * VF cache occasionally. It's easier if we can assume we start with a + * fresh cache (See also genX(cmd_buffer_set_binding_for_gen8_vb_flush).) + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + + /* Re-emit the aux table register in every command buffer. This way we're + * ensured that we have the table even if this command buffer doesn't + * initialize any images. */ - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT; /* We send an "Indirect State Pointers Disable" packet at * EndCommandBuffer, so all push contant packets are ignored during a @@ -1379,17 +1553,28 @@ genX(BeginCommandBuffer)( if (cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { assert(pBeginInfo->pInheritanceInfo); - cmd_buffer->state.pass = - anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); - cmd_buffer->state.subpass = - &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; + ANV_FROM_HANDLE(anv_render_pass, pass, + pBeginInfo->pInheritanceInfo->renderPass); + struct anv_subpass *subpass = + &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; + ANV_FROM_HANDLE(anv_framebuffer, framebuffer, + pBeginInfo->pInheritanceInfo->framebuffer); + + cmd_buffer->state.pass = pass; + cmd_buffer->state.subpass = subpass; /* This is optional in the inheritance info. */ - cmd_buffer->state.framebuffer = - anv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); + cmd_buffer->state.framebuffer = framebuffer; - result = genX(cmd_buffer_setup_attachments)(cmd_buffer, - cmd_buffer->state.pass, NULL); + result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, + framebuffer, NULL); + if (result != VK_SUCCESS) + return result; + + result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer, pass, + subpass); + if (result != VK_SUCCESS) + return result; /* Record that HiZ is enabled if we can. */ if (cmd_buffer->state.framebuffer) { @@ -1402,15 +1587,30 @@ genX(BeginCommandBuffer)( enum isl_aux_usage aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image, - VK_IMAGE_ASPECT_DEPTH_BIT, layout); + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + layout); - cmd_buffer->state.hiz_enabled = aux_usage == ISL_AUX_USAGE_HIZ; + cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(aux_usage); } } cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; } +#if GEN_GEN >= 8 || GEN_IS_HASWELL + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info = + vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT); + + /* If secondary buffer supports conditional rendering + * we should emit commands as if conditional rendering is enabled. 
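/* A standalone sketch of the conditional-rendering hand-off around secondary
 * command buffers: the secondary records its draws as if conditional
 * rendering were enabled (the assignment just below), and if the primary
 * later executes it without conditional rendering, the predicate register the
 * secondary reads has to be primed to "draw everything" (the gen_mi_store of
 * UINT64_MAX in the CmdExecuteCommands hunk further down).  The struct and
 * field names here are stand-ins, and the register write is reduced to a
 * plain assignment.
 */
#include <stdbool.h>
#include <stdint.h>

struct sk_cond_render_state {
   bool     conditional_render_enabled;
   uint64_t predicate_result_reg; /* stands in for ANV_PREDICATE_RESULT_REG */
};

static void
sk_execute_secondary(struct sk_cond_render_state *primary,
                     const struct sk_cond_render_state *secondary)
{
   if (secondary->conditional_render_enabled &&
       !primary->conditional_render_enabled) {
      /* Satisfy the secondary's dependency: predicate evaluates to true. */
      primary->predicate_result_reg = UINT64_MAX;
   }
}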
+ */ + cmd_buffer->state.conditional_render_enabled = + conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable; + } +#endif + return result; } @@ -1515,6 +1715,21 @@ genX(CmdExecuteCommands)( assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); assert(!anv_batch_has_error(&secondary->batch)); +#if GEN_GEN >= 8 || GEN_IS_HASWELL + if (secondary->state.conditional_render_enabled) { + if (!primary->state.conditional_render_enabled) { + /* Secondary buffer is constructed as if it will be executed + * with conditional rendering, we should satisfy this dependency + * regardless of conditional rendering being enabled in primary. + */ + struct gen_mi_builder b; + gen_mi_builder_init(&b, &primary->batch); + gen_mi_store(&b, gen_mi_reg64(ANV_PREDICATE_RESULT_REG), + gen_mi_imm(UINT64_MAX)); + } + } +#endif + if (secondary->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { /* If we're continuing a render pass from the primary, we need to @@ -1523,8 +1738,8 @@ genX(CmdExecuteCommands)( */ struct anv_bo *ss_bo = primary->device->surface_state_pool.block_pool.bo; - struct anv_state src_state = primary->state.render_pass_states; - struct anv_state dst_state = secondary->state.render_pass_states; + struct anv_state src_state = primary->state.attachment_states; + struct anv_state dst_state = secondary->state.attachment_states; assert(src_state.alloc_size == dst_state.alloc_size); genX(cmd_buffer_so_memcpy)(primary, @@ -1542,6 +1757,14 @@ genX(CmdExecuteCommands)( anv_cmd_buffer_add_secondary(primary, secondary); } + /* The secondary isn't counted in our VF cache tracking so we need to + * invalidate the whole thing. + */ + if (GEN_GEN >= 8 && GEN_GEN <= 9) { + primary->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } + /* The secondary may have selected a different pipeline (3D or compute) and * may have changed the current L3$ configuration. Reset our tracking * variables to invalid values to ensure that we re-emit these in the case @@ -1550,6 +1773,7 @@ genX(CmdExecuteCommands)( */ primary->state.current_pipeline = UINT32_MAX; primary->state.current_l3_config = NULL; + primary->state.current_hash_scale = 0; /* Each of the secondary command buffers will use its own state base * address. We need to re-emit state base address for the primary after @@ -1581,7 +1805,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, gen_dump_l3_config(cfg, stderr); } - const bool has_slm = cfg->n[GEN_L3P_SLM]; + UNUSED const bool has_slm = cfg->n[GEN_L3P_SLM]; /* According to the hardware docs, the L3 partitioning can only be changed * while the pipeline is completely drained and the caches are flushed, @@ -1628,9 +1852,19 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]); +#if GEN_GEN >= 12 +#define L3_ALLOCATION_REG GENX(L3ALLOC) +#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num) +#else +#define L3_ALLOCATION_REG GENX(L3CNTLREG) +#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num) +#endif + uint32_t l3cr; - anv_pack_struct(&l3cr, GENX(L3CNTLREG), + anv_pack_struct(&l3cr, L3_ALLOCATION_REG, +#if GEN_GEN < 11 .SLMEnable = has_slm, +#endif #if GEN_GEN == 11 /* WA_1406697149: Bit 9 "Error Detection Behavior Control" must be set * in L3CNTLREG register. 
The default setting of the bit is not the @@ -1645,7 +1879,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, .AllAllocation = cfg->n[GEN_L3P_ALL]); /* Set up the L3 partitioning. */ - emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr); + emit_lri(&cmd_buffer->batch, L3_ALLOCATION_REG_num, l3cr); #else @@ -1669,7 +1903,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]); /* Minimum number of ways that can be allocated to the URB. */ - MAYBE_UNUSED const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0; + const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0; assert(cfg->n[GEN_L3P_URB] >= n0_urb); uint32_t l3sqcr1, l3cr2, l3cr3; @@ -1707,7 +1941,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3); #if GEN_IS_HASWELL - if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) { + if (cmd_buffer->device->physical->cmd_parser_version >= 4) { /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep * them disabled to avoid crashing the system hard. */ @@ -1730,35 +1964,173 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) { + UNUSED const struct gen_device_info *devinfo = &cmd_buffer->device->info; enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; - /* Flushes are pipelined while invalidations are handled immediately. - * Therefore, if we're flushing anything then we need to schedule a stall - * before any invalidations can happen. + if (cmd_buffer->device->physical->always_flush_cache) + bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; + + /* + * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": + * + * Write synchronization is a special case of end-of-pipe + * synchronization that requires that the render cache and/or depth + * related caches are flushed to memory, where the data will become + * globally visible. This type of synchronization is required prior to + * SW (CPU) actually reading the result data from memory, or initiating + * an operation that will use as a read surface (such as a texture + * surface) a previous render target and/or depth/stencil buffer + * + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Exercising the write cache flush bits (Render Target Cache Flush + * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only + * ensures the write caches are flushed and doesn't guarantee the data + * is globally visible. + * + * SW can track the completion of the end-of-pipe-synchronization by + * using "Notify Enable" and "PostSync Operation - Write Immediate + * Data" in the PIPE_CONTROL command. + * + * In other words, flushes are pipelined while invalidations are handled + * immediately. Therefore, if we're flushing anything then we need to + * schedule an end-of-pipe sync before any invalidations can happen. */ if (bits & ANV_PIPE_FLUSH_BITS) - bits |= ANV_PIPE_NEEDS_CS_STALL_BIT; + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + + + /* HSD 1209978178: docs say that before programming the aux table: + * + * "Driver must ensure that the engine is IDLE but ensure it doesn't + * add extra flushes in the case it knows that the engine is already + * IDLE." 
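/* A standalone sketch of the ordering rule spelled out in the end-of-pipe
 * synchronization comments above: write-cache flushes only become globally
 * visible at an end-of-pipe sync, so whenever the pending bits contain both
 * flushes and invalidations, the sync has to be resolved before the
 * invalidations run.  The bit values are arbitrary stand-ins for the
 * ANV_PIPE_* bits, not the real encodings.
 */
#include <stdint.h>

#define SK_FLUSH_BITS              0x00ffu
#define SK_INVALIDATE_BITS         0x0f00u
#define SK_NEEDS_END_OF_PIPE_SYNC  0x1000u
#define SK_END_OF_PIPE_SYNC        0x2000u

static uint32_t
sk_resolve_pending_bits(uint32_t bits)
{
   /* Any flush eventually wants an end-of-pipe sync. */
   if (bits & SK_FLUSH_BITS)
      bits |= SK_NEEDS_END_OF_PIPE_SYNC;

   /* If we are about to invalidate, the sync can no longer be deferred. */
   if ((bits & SK_INVALIDATE_BITS) && (bits & SK_NEEDS_END_OF_PIPE_SYNC)) {
      bits |= SK_END_OF_PIPE_SYNC;
      bits &= ~SK_NEEDS_END_OF_PIPE_SYNC;
   }

   return bits;
}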
+ */ + if (GEN_GEN == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; - /* If we're going to do an invalidate and we have a pending CS stall that - * has yet to be resolved, we do the CS stall now. + /* If we're going to do an invalidate and we have a pending end-of-pipe + * sync that has yet to be resolved, we do the end-of-pipe sync now. */ if ((bits & ANV_PIPE_INVALIDATE_BITS) && - (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) { - bits |= ANV_PIPE_CS_STALL_BIT; - bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT; + (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) { + bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + } + + if (GEN_GEN >= 12 && + ((bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT) || + (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT))) { + /* From the PIPE_CONTROL instruction table, bit 28 (Tile Cache Flush + * Enable): + * + * Unified Cache (Tile Cache Disabled): + * + * When the Color and Depth (Z) streams are enabled to be cached in + * the DC space of L2, Software must use "Render Target Cache Flush + * Enable" and "Depth Cache Flush Enable" along with "Tile Cache + * Flush" for getting the color and depth (Z) write data to be + * globally observable. In this mode of operation it is not required + * to set "CS Stall" upon setting "Tile Cache Flush" bit. + */ + bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; + } + + /* GEN:BUG:1409226450, Wait for EU to be idle before pipe control which + * invalidates the instruction cache + */ + if (GEN_GEN == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT)) + bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + + if ((GEN_GEN >= 8 && GEN_GEN <= 9) && + (bits & ANV_PIPE_CS_STALL_BIT) && + (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) { + /* If we are doing a VF cache invalidate AND a CS stall (it must be + * both) then we can reset our vertex cache tracking. + */ + memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0, + sizeof(cmd_buffer->state.gfx.vb_dirty_ranges)); + memset(&cmd_buffer->state.gfx.ib_dirty_range, 0, + sizeof(cmd_buffer->state.gfx.ib_dirty_range)); + } + + /* Project: SKL / Argument: LRI Post Sync Operation [23] + * + * "PIPECONTROL command with “Command Streamer Stall Enable” must be + * programmed prior to programming a PIPECONTROL command with "LRI + * Post Sync Operation" in GPGPU mode of operation (i.e when + * PIPELINE_SELECT command is set to GPGPU mode of operation)." + * + * The same text exists a few rows below for Post Sync Op. + * + * On Gen12 this is GEN:BUG:1607156449. + */ + if (bits & ANV_PIPE_POST_SYNC_BIT) { + if ((GEN_GEN == 9 || (GEN_GEN == 12 && devinfo->revision == 0 /* A0 */)) && + cmd_buffer->state.current_pipeline == GPGPU) + bits |= ANV_PIPE_CS_STALL_BIT; + bits &= ~ANV_PIPE_POST_SYNC_BIT; } - if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) { + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT)) { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { +#if GEN_GEN >= 12 + pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT; +#endif pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT; pipe.RenderTargetCacheFlushEnable = bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must + * be set with any PIPE_CONTROL with Depth Flush Enable bit set. 
+ */ +#if GEN_GEN >= 12 + pipe.DepthStallEnable = + pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT); +#else pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT; +#endif + pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": + * + * "The most common action to perform upon reaching a + * synchronization point is to write a value out to memory. An + * immediate value (included with the synchronization command) may + * be written." + * + * + * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": + * + * "In case the data flushed out by the render engine is to be + * read back in to the render engine in coherent manner, then the + * render engine has to wait for the fence completion before + * accessing the flushed data. This can be achieved by following + * means on various products: PIPE_CONTROL command with CS Stall + * and the required write caches flushed with Post-Sync-Operation + * as Write Immediate Data. + * + * Example: + * - Workload-1 (3D/GPGPU/MEDIA) + * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write + * Immediate Data, Required Write Cache Flush bits set) + * - Workload-2 (Can use the data produce or output by + * Workload-1) + */ + if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) { + pipe.CommandStreamerStallEnable = true; + pipe.PostSyncOperation = WriteImmediateData; + pipe.Address = (struct anv_address) { + .bo = cmd_buffer->device->workaround_bo, + .offset = 0 + }; + } + /* * According to the Broadwell documentation, any PIPE_CONTROL with the * "Command Streamer Stall" bit set must also have another bit set, @@ -1774,9 +2146,13 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) * I chose "Stall at Pixel Scoreboard" since that's what we use in * mesa and it seems to work fine. The choice is fairly arbitrary. */ - if ((bits & ANV_PIPE_CS_STALL_BIT) && - !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT | - ANV_PIPE_STALL_AT_SCOREBOARD_BIT))) + if (pipe.CommandStreamerStallEnable && + !pipe.RenderTargetCacheFlushEnable && + !pipe.DepthCacheFlushEnable && + !pipe.StallAtPixelScoreboard && + !pipe.PostSyncOperation && + !pipe.DepthStallEnable && + !pipe.DCFlushEnable) pipe.StallAtPixelScoreboard = true; } @@ -1784,9 +2160,53 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) * saying that render target writes are ongoing. */ if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) - bits &= ~(ANV_PIPE_RENDER_TARGET_WRITES); + bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES); + + if (GEN_IS_HASWELL) { + /* Haswell needs addition work-arounds: + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Option 1: + * PIPE_CONTROL command with the CS Stall and the required write + * caches flushed with Post-SyncOperation as Write Immediate Data + * followed by eight dummy MI_STORE_DATA_IMM (write to scratch + * spce) commands. + * + * Example: + * - Workload-1 + * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write + * Immediate Data, Required Write Cache Flush bits set) + * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address) + * - Workload-2 (Can use the data produce or output by + * Workload-1) + * + * Unfortunately, both the PRMs and the internal docs are a bit + * out-of-date in this regard. 
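/* A standalone sketch of how the PIPE_CONTROL fields above are derived from
 * the pending bits, folding together three rules visible in this hunk: on
 * Gen12 a depth-cache flush implies Depth Stall, an end-of-pipe sync is
 * expressed as CS Stall plus a post-sync immediate write (to the workaround
 * BO), and a lone CS Stall must be given a companion bit, with Stall At Pixel
 * Scoreboard as the arbitrary choice.  The struct is a stand-in for the
 * packed PIPE_CONTROL, not the genxml structure.
 */
#include <stdbool.h>

struct sk_pipe_control {
   bool depth_cache_flush;
   bool render_target_cache_flush;
   bool dc_flush;
   bool depth_stall;
   bool cs_stall;
   bool stall_at_pixel_scoreboard;
   bool post_sync_write_immediate;
};

static void
sk_derive_pipe_control(struct sk_pipe_control *pc, bool is_gen12,
                       bool want_depth_stall, bool want_end_of_pipe_sync)
{
   pc->depth_stall = want_depth_stall;
   if (is_gen12 && pc->depth_cache_flush)
      pc->depth_stall = true;

   if (want_end_of_pipe_sync) {
      pc->cs_stall = true;
      pc->post_sync_write_immediate = true;
   }

   /* A CS stall on its own is not allowed; give it a companion bit. */
   if (pc->cs_stall &&
       !pc->render_target_cache_flush && !pc->depth_cache_flush &&
       !pc->stall_at_pixel_scoreboard && !pc->post_sync_write_immediate &&
       !pc->depth_stall && !pc->dc_flush)
      pc->stall_at_pixel_scoreboard = true;
}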
What the windows driver does (and + * this appears to actually work) is to emit a register read from the + * memory address written by the pipe control above. + * + * What register we load into doesn't matter. We choose an indirect + * rendering register because we know it always exists and it's one + * of the first registers the command parser allows us to write. If + * you don't have command parser support in your kernel (pre-4.2), + * this will get turned into MI_NOOP and you won't get the + * workaround. Unfortunately, there's just not much we can do in + * that case. This register is perfectly safe to write since we + * always re-load all of the indirect draw registers right before + * 3DPRIMITIVE when needed anyway. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = 0x243C; /* GEN7_3DPRIM_START_INSTANCE */ + lrm.MemoryAddress = (struct anv_address) { + .bo = cmd_buffer->device->workaround_bo, + .offset = 0 + }; + } + } - bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT); + bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT); } if (bits & ANV_PIPE_INVALIDATE_BITS) { @@ -1824,9 +2244,19 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) if (GEN_GEN == 9 && pipe.VFCacheInvalidationEnable) { pipe.PostSyncOperation = WriteImmediateData; pipe.Address = - (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; + } + } + +#if GEN_GEN == 12 + if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && + cmd_buffer->device->info.has_aux_map) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num); + lri.DataDWord = 1; } } +#endif bits &= ~ANV_PIPE_INVALIDATE_BITS; } @@ -1872,24 +2302,35 @@ void genX(CmdPipelineBarrier)( const VkImageSubresourceRange *range = &pImageMemoryBarriers[i].subresourceRange; + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->extent.depth, range->baseMipLevel); + } else { + base_layer = range->baseArrayLayer; + layer_count = anv_get_layerCount(image, range); + } + if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, + base_layer, layer_count, pImageMemoryBarriers[i].oldLayout, pImageMemoryBarriers[i].newLayout); - } else if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + } + + if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + range->baseMipLevel, + anv_get_levelCount(image, range), + base_layer, layer_count, + pImageMemoryBarriers[i].oldLayout, + pImageMemoryBarriers[i].newLayout); + } + + if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { VkImageAspectFlags color_aspects = anv_image_expand_aspects(image, range->aspectMask); uint32_t aspect_bit; - - uint32_t base_layer, layer_count; - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(image->extent.depth, range->baseMipLevel); - } else { - base_layer = range->baseArrayLayer; - layer_count = anv_get_layerCount(image, range); - } - anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) { transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit, range->baseMipLevel, @@ -1910,7 +2351,7 @@ static void cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) { VkShaderStageFlags stages = - 
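/* A standalone sketch of the layer-range selection used in the
 * CmdPipelineBarrier hunk above: 3D images transition all of their depth
 * slices at the given mip level, while array images use the subresource
 * range directly.  The minify helper assumes the usual max(1, x >> level)
 * behaviour of anv_minify; that is an assumption of this sketch, not
 * something stated by the patch.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t
sk_minify(uint32_t x, uint32_t level)
{
   uint32_t v = x >> level;
   return v ? v : 1;
}

static void
sk_barrier_layer_range(bool is_3d, uint32_t image_depth,
                       uint32_t base_mip, uint32_t base_array_layer,
                       uint32_t array_layer_count,
                       uint32_t *base_layer, uint32_t *layer_count)
{
   if (is_3d) {
      *base_layer = 0;
      *layer_count = sk_minify(image_depth, base_mip);
   } else {
      *base_layer = base_array_layer;
      *layer_count = array_layer_count;
   }
}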
cmd_buffer->state.gfx.base.pipeline->active_stages; + cmd_buffer->state.gfx.pipeline->active_stages; /* In order to avoid thrash, we assume that vertex and fragment stages * always exist. In the rare case where one is missing *and* the other @@ -1973,117 +2414,73 @@ cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; } -static const struct anv_descriptor * -anv_descriptor_for_binding(const struct anv_cmd_pipeline_state *pipe_state, - const struct anv_pipeline_binding *binding) +static struct anv_address +anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer, + struct anv_descriptor_set *set) { - assert(binding->set < MAX_SETS); - const struct anv_descriptor_set *set = - pipe_state->descriptors[binding->set]; - const uint32_t offset = - set->layout->binding[binding->binding].descriptor_index; - return &set->descriptors[offset + binding->index]; -} - -static uint32_t -dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state, - const struct anv_pipeline_binding *binding) -{ - assert(binding->set < MAX_SETS); - const struct anv_descriptor_set *set = - pipe_state->descriptors[binding->set]; - - uint32_t dynamic_offset_idx = - pipe_state->layout->set[binding->set].dynamic_offset_start + - set->layout->binding[binding->binding].dynamic_offset_index + - binding->index; + if (set->pool) { + /* This is a normal descriptor set */ + return (struct anv_address) { + .bo = set->pool->bo, + .offset = set->desc_mem.offset, + }; + } else { + /* This is a push descriptor set. We have to flag it as used on the GPU + * so that the next time we push descriptors, we grab a new memory. + */ + struct anv_push_descriptor_set *push_set = + (struct anv_push_descriptor_set *)set; + push_set->set_used_on_gpu = true; - return pipe_state->dynamic_offsets[dynamic_offset_idx]; + return (struct anv_address) { + .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo, + .offset = set->desc_mem.offset, + }; + } } static VkResult emit_binding_table(struct anv_cmd_buffer *cmd_buffer, - gl_shader_stage stage, + struct anv_cmd_pipeline_state *pipe_state, + struct anv_shader_bin *shader, struct anv_state *bt_state) { - const struct gen_device_info *devinfo = &cmd_buffer->device->info; struct anv_subpass *subpass = cmd_buffer->state.subpass; - struct anv_cmd_pipeline_state *pipe_state; - struct anv_pipeline *pipeline; - uint32_t bias, state_offset; - - switch (stage) { - case MESA_SHADER_COMPUTE: - pipe_state = &cmd_buffer->state.compute.base; - bias = 1; - break; - default: - pipe_state = &cmd_buffer->state.gfx.base; - bias = 0; - break; - } - pipeline = pipe_state->pipeline; - - if (!anv_pipeline_has_stage(pipeline, stage)) { - *bt_state = (struct anv_state) { 0, }; - return VK_SUCCESS; - } + uint32_t state_offset; - struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map; - if (bias + map->surface_count == 0) { + struct anv_pipeline_bind_map *map = &shader->bind_map; + if (map->surface_count == 0) { *bt_state = (struct anv_state) { 0, }; return VK_SUCCESS; } *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, - bias + map->surface_count, + map->surface_count, &state_offset); uint32_t *bt_map = bt_state->map; if (bt_state->map == NULL) return VK_ERROR_OUT_OF_DEVICE_MEMORY; - if (stage == MESA_SHADER_COMPUTE && - get_cs_prog_data(pipeline)->uses_num_work_groups) { - struct anv_state surface_state; - surface_state = - anv_cmd_buffer_alloc_surface_state(cmd_buffer); - - const enum isl_format 
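/* A standalone sketch of what emit_binding_table ultimately produces: a flat
 * array of dword entries, one per binding-table slot, each holding the offset
 * of a surface state in the surface-state heap plus the delta returned by the
 * binding-table allocator (state_offset above); a NULL-set slot simply stores
 * zero.  The struct below is illustrative only.
 */
#include <stddef.h>
#include <stdint.h>

struct sk_surface_ref {
   int      is_null;         /* an ANV_DESCRIPTOR_SET_NULL-style slot */
   uint32_t ss_heap_offset;  /* offset of the surface state in the heap */
};

static void
sk_fill_binding_table(uint32_t *bt_map, const struct sk_surface_ref *refs,
                      size_t count, uint32_t state_offset)
{
   for (size_t s = 0; s < count; s++)
      bt_map[s] = refs[s].is_null ? 0
                                  : refs[s].ss_heap_offset + state_offset;
}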
format = - anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, - format, - cmd_buffer->state.compute.num_workgroups, - 12, 1); - - bt_map[0] = surface_state.offset + state_offset; - add_surface_reloc(cmd_buffer, surface_state, - cmd_buffer->state.compute.num_workgroups); - } - - if (map->surface_count == 0) - goto out; - - /* We only use push constant space for images before gen9 */ - if (map->image_count > 0 && devinfo->gen < 9) { - VkResult result = - anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images); - if (result != VK_SUCCESS) - return result; - - cmd_buffer->state.push_constants_dirty |= 1 << stage; - } + /* We only need to emit relocs if we're not using softpin. If we are using + * softpin then we always keep all user-allocated memory objects resident. + */ + const bool need_client_mem_relocs = + !cmd_buffer->device->physical->use_softpin; - uint32_t image = 0; for (uint32_t s = 0; s < map->surface_count; s++) { struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; struct anv_state surface_state; - if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) { + switch (binding->set) { + case ANV_DESCRIPTOR_SET_NULL: + bt_map[s] = 0; + break; + + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: /* Color attachment binding */ - assert(stage == MESA_SHADER_FRAGMENT); - assert(binding->binding == 0); + assert(shader->stage == MESA_SHADER_FRAGMENT); if (binding->index < subpass->color_count) { const unsigned att = subpass->color_attachments[binding->index].attachment; @@ -2103,18 +2500,19 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, surface_state = cmd_buffer->state.null_surface_state; } - bt_map[bias + s] = surface_state.offset + state_offset; - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + break; + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: { struct anv_state surface_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer); struct anv_address constant_data = { - .bo = pipeline->device->dynamic_state_pool.block_pool.bo, - .offset = pipeline->shaders[stage]->constant_data.offset, + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = shader->constant_data.offset, }; - unsigned constant_data_size = - pipeline->shaders[stage]->constant_data_size; + unsigned constant_data_size = shader->constant_data_size; const enum isl_format format = anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); @@ -2122,174 +2520,185 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, surface_state, format, constant_data, constant_data_size, 1); - bt_map[bias + s] = surface_state.offset + state_offset; + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; add_surface_reloc(cmd_buffer, surface_state, constant_data); - continue; + break; } - const struct anv_descriptor *desc = - anv_descriptor_for_binding(pipe_state, binding); + case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: { + /* This is always the first binding for compute shaders */ + assert(shader->stage == MESA_SHADER_COMPUTE && s == 0); - switch (desc->type) { - case VK_DESCRIPTOR_TYPE_SAMPLER: - /* Nothing for us to do here */ - continue; + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { - struct anv_surface_state sstate = - 
(desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? - desc->image_view->planes[binding->plane].general_sampler_surface_state : - desc->image_view->planes[binding->plane].optimal_sampler_surface_state; - surface_state = sstate.state; - assert(surface_state.alloc_size); - add_surface_state_relocs(cmd_buffer, sstate); + const enum isl_format format = + anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, + cmd_buffer->state.compute.num_workgroups, + 12, 1); + + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + cmd_buffer->state.compute.num_workgroups); + } break; } - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - assert(stage == MESA_SHADER_FRAGMENT); - if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { - /* For depth and stencil input attachments, we treat it like any - * old texture that a user may have bound. - */ + + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + /* This is a descriptor set buffer so the set index is actually + * given by binding->binding. (Yes, that's confusing.) + */ + struct anv_descriptor_set *set = + pipe_state->descriptors[binding->index]; + assert(set->desc_mem.alloc_size); + assert(set->desc_surface_state.alloc_size); + bt_map[s] = set->desc_surface_state.offset + state_offset; + add_surface_reloc(cmd_buffer, set->desc_surface_state, + anv_descriptor_set_address(cmd_buffer, set)); + break; + } + + default: { + assert(binding->set < MAX_SETS); + const struct anv_descriptor *desc = + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; + + switch (desc->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* Nothing for us to do here */ + continue; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { struct anv_surface_state sstate = (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? desc->image_view->planes[binding->plane].general_sampler_surface_state : desc->image_view->planes[binding->plane].optimal_sampler_surface_state; surface_state = sstate.state; assert(surface_state.alloc_size); - add_surface_state_relocs(cmd_buffer, sstate); - } else { - /* For color input attachments, we create the surface state at - * vkBeginRenderPass time so that we can include aux and clear - * color information. - */ - assert(binding->input_attachment_index < subpass->input_count); - const unsigned subpass_att = binding->input_attachment_index; - const unsigned att = subpass->input_attachments[subpass_att].attachment; - surface_state = cmd_buffer->state.attachments[att].input.state; + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + break; } - break; + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + assert(shader->stage == MESA_SHADER_FRAGMENT); + if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { + /* For depth and stencil input attachments, we treat it like any + * old texture that a user may have bound. + */ + assert(desc->image_view->n_planes == 1); + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? 
+ desc->image_view->planes[0].general_sampler_surface_state : + desc->image_view->planes[0].optimal_sampler_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + } else { + /* For color input attachments, we create the surface state at + * vkBeginRenderPass time so that we can include aux and clear + * color information. + */ + assert(binding->input_attachment_index < subpass->input_count); + const unsigned subpass_att = binding->input_attachment_index; + const unsigned att = subpass->input_attachments[subpass_att].attachment; + surface_state = cmd_buffer->state.attachments[att].input.state; + } + break; - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { - struct anv_surface_state sstate = (binding->write_only) - ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state - : desc->image_view->planes[binding->plane].storage_surface_state; - surface_state = sstate.state; - assert(surface_state.alloc_size); - add_surface_state_relocs(cmd_buffer, sstate); - if (devinfo->gen < 9) { - assert(image < MAX_GEN8_IMAGES); - struct brw_image_param *image_param = - &cmd_buffer->state.push_constants[stage]->images[image]; - - *image_param = - desc->image_view->planes[binding->plane].storage_image_param; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { + struct anv_surface_state sstate = (binding->write_only) + ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state + : desc->image_view->planes[binding->plane].storage_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + break; } - image++; - break; - } - - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - surface_state = desc->buffer_view->surface_state; - assert(surface_state.alloc_size); - add_surface_reloc(cmd_buffer, surface_state, - desc->buffer_view->address); - break; - - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { - /* Compute the offset within the buffer */ - uint32_t dynamic_offset = - dynamic_offset_for_binding(pipe_state, binding); - uint64_t offset = desc->offset + dynamic_offset; - /* Clamp to the buffer size */ - offset = MIN2(offset, desc->buffer->size); - /* Clamp the range to the buffer size */ - uint32_t range = MIN2(desc->range, desc->buffer->size - offset); - struct anv_address address = - anv_address_add(desc->buffer->address, offset); - - surface_state = - anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); - enum isl_format format = - anv_isl_format_for_descriptor_type(desc->type); + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + surface_state = desc->buffer_view->surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + /* Compute the offset within the buffer */ + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[shader->stage]; + + uint32_t dynamic_offset = + push->dynamic_offsets[binding->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, 
desc->buffer->size); + /* Clamp the range to the buffer size */ + uint32_t range = MIN2(desc->range, desc->buffer->size - offset); + + /* Align the range for consistency */ + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) + range = align_u32(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); + + struct anv_address address = + anv_address_add(desc->buffer->address, offset); + + surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); + enum isl_format format = + anv_isl_format_for_descriptor_type(desc->type); + + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, address, range, 1); + if (need_client_mem_relocs) + add_surface_reloc(cmd_buffer, surface_state, address); + break; + } - anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, - format, address, range, 1); - add_surface_reloc(cmd_buffer, surface_state, address); - break; - } + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + surface_state = (binding->write_only) + ? desc->buffer_view->writeonly_storage_surface_state + : desc->buffer_view->storage_surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + break; - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - surface_state = (binding->write_only) - ? desc->buffer_view->writeonly_storage_surface_state - : desc->buffer_view->storage_surface_state; - assert(surface_state.alloc_size); - add_surface_reloc(cmd_buffer, surface_state, - desc->buffer_view->address); - if (devinfo->gen < 9) { - assert(image < MAX_GEN8_IMAGES); - struct brw_image_param *image_param = - &cmd_buffer->state.push_constants[stage]->images[image]; - - *image_param = desc->buffer_view->storage_image_param; + default: + assert(!"Invalid descriptor type"); + continue; } - image++; + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; break; - - default: - assert(!"Invalid descriptor type"); - continue; } - - bt_map[bias + s] = surface_state.offset + state_offset; + } } - assert(image == map->image_count); - - out: -#if GEN_GEN >= 11 - /* The PIPE_CONTROL command description says: - * - * "Whenever a Binding Table Index (BTI) used by a Render Taget Message - * points to a different RENDER_SURFACE_STATE, SW must issue a Render - * Target Cache Flush by enabling this bit. When render target flush - * is set due to new association of BTI, PS Scoreboard Stall bit must - * be set in this packet." - * - * FINISHME: Currently we shuffle around the surface states in the binding - * table based on if they are getting used or not. So, we've to do below - * pipe control flush for every binding table upload. Make changes so - * that we do it only when we modify render target surface states. - */ - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.RenderTargetCacheFlushEnable = true; - pc.StallAtPixelScoreboard = true; - } -#endif return VK_SUCCESS; } static VkResult emit_samplers(struct anv_cmd_buffer *cmd_buffer, - gl_shader_stage stage, + struct anv_cmd_pipeline_state *pipe_state, + struct anv_shader_bin *shader, struct anv_state *state) { - struct anv_cmd_pipeline_state *pipe_state = - stage == MESA_SHADER_COMPUTE ? 
&cmd_buffer->state.compute.base : - &cmd_buffer->state.gfx.base; - struct anv_pipeline *pipeline = pipe_state->pipeline; - - if (!anv_pipeline_has_stage(pipeline, stage)) { - *state = (struct anv_state) { 0, }; - return VK_SUCCESS; - } - - struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map; + struct anv_pipeline_bind_map *map = &shader->bind_map; if (map->sampler_count == 0) { *state = (struct anv_state) { 0, }; return VK_SUCCESS; @@ -2304,7 +2713,7 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer, for (uint32_t s = 0; s < map->sampler_count; s++) { struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; const struct anv_descriptor *desc = - anv_descriptor_for_binding(pipe_state, binding); + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) @@ -2326,22 +2735,34 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer, } static uint32_t -flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer) +flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state, + struct anv_shader_bin **shaders, + uint32_t num_shaders) { - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; - - VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty & - pipeline->active_stages; + const VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty; + VkShaderStageFlags flushed = 0; VkResult result = VK_SUCCESS; - anv_foreach_stage(s, dirty) { - result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]); + for (uint32_t i = 0; i < num_shaders; i++) { + if (!shaders[i]) + continue; + + gl_shader_stage stage = shaders[i]->stage; + VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage); + if ((vk_stage & dirty) == 0) + continue; + + result = emit_samplers(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.samplers[stage]); if (result != VK_SUCCESS) break; - result = emit_binding_table(cmd_buffer, s, - &cmd_buffer->state.binding_tables[s]); + result = emit_binding_table(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.binding_tables[stage]); if (result != VK_SUCCESS) break; + + flushed |= vk_stage; } if (result != VK_SUCCESS) { @@ -2357,25 +2778,34 @@ flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer) genX(cmd_buffer_emit_state_base_address)(cmd_buffer); /* Re-emit all active binding tables */ - dirty |= pipeline->active_stages; - anv_foreach_stage(s, dirty) { - result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]); + flushed = 0; + + for (uint32_t i = 0; i < num_shaders; i++) { + if (!shaders[i]) + continue; + + gl_shader_stage stage = shaders[i]->stage; + + result = emit_samplers(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.samplers[stage]); if (result != VK_SUCCESS) { anv_batch_set_error(&cmd_buffer->batch, result); return 0; } - result = emit_binding_table(cmd_buffer, s, - &cmd_buffer->state.binding_tables[s]); + result = emit_binding_table(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.binding_tables[stage]); if (result != VK_SUCCESS) { anv_batch_set_error(&cmd_buffer->batch, result); return 0; } + + flushed |= mesa_to_vk_shader_stage(stage); } } - cmd_buffer->state.descriptors_dirty &= ~dirty; + cmd_buffer->state.descriptors_dirty &= ~flushed; - return dirty; + return flushed; } static void @@ -2422,12 +2852,120 @@ cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer, } } +static struct anv_address 
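/* A standalone sketch of the clamping done for dynamic uniform/storage
 * buffers both in emit_binding_table and in get_push_range_bound_size above:
 * the dynamic offset is added, the result is clamped to the buffer size, the
 * range is clamped to what is left of the buffer, and uniform-buffer ranges
 * are then rounded up to the bounds-check alignment.  The 32-byte value in
 * the comment is only an example; the real constant is
 * ANV_UBO_BOUNDS_CHECK_ALIGNMENT, and the alignment is assumed to be a power
 * of two.
 */
#include <stdint.h>

static uint64_t sk_min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

static uint32_t
sk_align_range(uint32_t v, uint32_t a)
{
   return (v + a - 1) & ~(a - 1);
}

static uint32_t
sk_dynamic_buffer_bound_range(uint64_t buffer_size, uint64_t desc_offset,
                              uint64_t dynamic_offset, uint32_t desc_range,
                              int is_uniform, uint32_t ubo_align /* e.g. 32 */)
{
   uint64_t offset = sk_min_u64(desc_offset + dynamic_offset, buffer_size);
   uint32_t range = (uint32_t)sk_min_u64(desc_range, buffer_size - offset);

   if (is_uniform)
      range = sk_align_range(range, ubo_align);

   return range;
}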
+get_push_range_address(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + const struct anv_push_range *range) +{ + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + /* This is a descriptor set buffer so the set index is + * actually given by binding->binding. (Yes, that's + * confusing.) + */ + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->index]; + return anv_descriptor_set_address(cmd_buffer, set); + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: { + struct anv_state state = + anv_cmd_buffer_push_constants(cmd_buffer, stage); + return (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = state.offset, + }; + } + + default: { + assert(range->set < MAX_SETS); + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->set]; + const struct anv_descriptor *desc = + &set->descriptors[range->index]; + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + return desc->buffer_view->address; + } else { + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + uint32_t dynamic_offset = + push->dynamic_offsets[range->dynamic_offset_index]; + return anv_address_add(desc->buffer->address, + desc->offset + dynamic_offset); + } + } + } +} + + +/** Returns the size in bytes of the bound buffer + * + * The range is relative to the start of the buffer, not the start of the + * range. The returned range may be smaller than + * + * (range->start + range->length) * 32; + */ +static uint32_t +get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + const struct anv_push_range *range) +{ + assert(stage != MESA_SHADER_COMPUTE); + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->index]; + assert(range->start * 32 < set->desc_mem.alloc_size); + assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size); + return set->desc_mem.alloc_size; + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: + return (range->start + range->length) * 32; + + default: { + assert(range->set < MAX_SETS); + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->set]; + const struct anv_descriptor *desc = + &set->descriptors[range->index]; + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + return desc->buffer_view->range; + } else { + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + /* Compute the offset within the buffer */ + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + uint32_t dynamic_offset = + push->dynamic_offsets[range->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->size); + /* Clamp the range to the buffer size */ + uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset); + + /* Align the range for consistency */ + bound_range = align_u32(bound_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); + + return bound_range; + } + } + } +} + static void -cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, - VkShaderStageFlags dirty_stages) +cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + struct anv_address *buffers, + unsigned 
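/* A standalone sketch of the "highest slots first" rule quoted above from the
 * Skylake PRM: with four constant-buffer slots available, N bound push ranges
 * are placed in slots (4 - N) .. 3, so slot 0 can only be in use if slot 3 is
 * also in use.  The helper just computes the slot index for range i.
 */
#include <assert.h>

static unsigned
sk_push_range_slot(unsigned buffer_count, unsigned i)
{
   assert(buffer_count <= 4 && i < buffer_count);
   const unsigned shift = 4 - buffer_count;
   return i + shift;   /* e.g. two buffers land in slots 2 and 3 */
}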
buffer_count) { const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; - const struct anv_pipeline *pipeline = gfx_state->base.pipeline; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; static const uint32_t push_constant_opcodes[] = { [MESA_SHADER_VERTEX] = 21, @@ -2438,145 +2976,258 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, [MESA_SHADER_COMPUTE] = 0, }; - VkShaderStageFlags flushed = 0; + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + assert(push_constant_opcodes[stage] > 0); - anv_foreach_stage(stage, dirty_stages) { - assert(stage < ARRAY_SIZE(push_constant_opcodes)); - assert(push_constant_opcodes[stage] > 0); + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { + c._3DCommandSubOpcode = push_constant_opcodes[stage]; - anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { - c._3DCommandSubOpcode = push_constant_opcodes[stage]; + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + +#if GEN_GEN >= 12 + c.MOCS = cmd_buffer->device->isl_dev.mocs.internal; +#endif - if (anv_pipeline_has_stage(pipeline, stage)) { #if GEN_GEN >= 8 || GEN_IS_HASWELL - const struct brw_stage_prog_data *prog_data = - pipeline->shaders[stage]->prog_data; - const struct anv_pipeline_bind_map *bind_map = - &pipeline->shaders[stage]->bind_map; + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. + */ + assert(buffer_count <= 4); + const unsigned shift = 4 - buffer_count; + for (unsigned i = 0; i < buffer_count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; - /* The Skylake PRM contains the following restriction: - * - * "The driver must ensure The following case does not occur - * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with - * buffer 3 read length equal to zero committed followed by a - * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to - * zero committed." - * - * To avoid this, we program the buffers in the highest slots. - * This way, slot 0 is only used if slot 3 is also used. + /* At this point we only have non-empty ranges */ + assert(range->length > 0); + + /* For Ivy Bridge, make sure we only set the first range (actual + * push constants) */ - int n = 3; + assert((GEN_GEN >= 8 || GEN_IS_HASWELL) || i == 0); + + c.ConstantBody.ReadLength[i + shift] = range->length; + c.ConstantBody.Buffer[i + shift] = + anv_address_add(buffers[i], range->start * 32); + } +#else + /* For Ivy Bridge, push constants are relative to dynamic state + * base address and we only ever push actual push constants. 
+ */ + if (bind_map->push_ranges[0].length > 0) { + assert(buffer_count == 1); + assert(bind_map->push_ranges[0].set == + ANV_DESCRIPTOR_SET_PUSH_CONSTANTS); + assert(buffers[0].bo == + cmd_buffer->device->dynamic_state_pool.block_pool.bo); + c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length; + c.ConstantBody.Buffer[0].bo = NULL; + c.ConstantBody.Buffer[0].offset = buffers[0].offset; + } + assert(bind_map->push_ranges[1].length == 0); + assert(bind_map->push_ranges[2].length == 0); + assert(bind_map->push_ranges[3].length == 0); +#endif + } + } +} + +#if GEN_GEN >= 12 +static void +cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer, + uint32_t shader_mask, + struct anv_address *buffers, + uint32_t buffer_count) +{ + if (buffer_count == 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) { + c.ShaderUpdateEnable = shader_mask; + c.MOCS = cmd_buffer->device->isl_dev.mocs.internal; + } + return; + } + + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + + static const uint32_t push_constant_opcodes[] = { + [MESA_SHADER_VERTEX] = 21, + [MESA_SHADER_TESS_CTRL] = 25, /* HS */ + [MESA_SHADER_TESS_EVAL] = 26, /* DS */ + [MESA_SHADER_GEOMETRY] = 22, + [MESA_SHADER_FRAGMENT] = 23, + [MESA_SHADER_COMPUTE] = 0, + }; + + gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask); + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + assert(push_constant_opcodes[stage] > 0); + + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + + uint32_t *dw; + const uint32_t buffer_mask = (1 << buffer_count) - 1; + const uint32_t num_dwords = 2 + 2 * buffer_count; + + dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_CONSTANT_ALL), + .ShaderUpdateEnable = shader_mask, + .PointerBufferMask = buffer_mask, + .MOCS = cmd_buffer->device->isl_dev.mocs.internal); + + for (int i = 0; i < buffer_count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)( + &cmd_buffer->batch, dw + 2 + i * 2, + &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) { + .PointerToConstantBuffer = + anv_address_add(buffers[i], range->start * 32), + .ConstantBufferReadLength = range->length, + }); + } +} +#endif + +static void +cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, + VkShaderStageFlags dirty_stages) +{ + VkShaderStageFlags flushed = 0; + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + +#if GEN_GEN >= 12 + uint32_t nobuffer_stages = 0; +#endif - for (int i = 3; i >= 0; i--) { - const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + anv_foreach_stage(stage, dirty_stages) { + unsigned buffer_count = 0; + flushed |= mesa_to_vk_shader_stage(stage); + UNUSED uint32_t max_push_range = 0; + + struct anv_address buffers[4] = {}; + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + + if (cmd_buffer->device->robust_buffer_access) { + push->push_reg_mask = 0; + /* Start of the current range in the shader, relative to the start + * of push constants in the shader. 
+ */ + unsigned range_start_reg = 0; + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; if (range->length == 0) continue; - const unsigned surface = - prog_data->binding_table.ubo_start + range->block; - - assert(surface <= bind_map->surface_count); - const struct anv_pipeline_binding *binding = - &bind_map->surface_to_descriptor[surface]; - - struct anv_address read_addr; - uint32_t read_len; - if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { - struct anv_address constant_data = { - .bo = pipeline->device->dynamic_state_pool.block_pool.bo, - .offset = pipeline->shaders[stage]->constant_data.offset, - }; - unsigned constant_data_size = - pipeline->shaders[stage]->constant_data_size; - - read_len = MIN2(range->length, - DIV_ROUND_UP(constant_data_size, 32) - range->start); - read_addr = anv_address_add(constant_data, - range->start * 32); - } else { - const struct anv_descriptor *desc = - anv_descriptor_for_binding(&gfx_state->base, binding); - - if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { - read_len = MIN2(range->length, - DIV_ROUND_UP(desc->buffer_view->range, 32) - range->start); - read_addr = anv_address_add(desc->buffer_view->address, - range->start * 32); - } else { - assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); - - uint32_t dynamic_offset = - dynamic_offset_for_binding(&gfx_state->base, binding); - uint32_t buf_offset = - MIN2(desc->offset + dynamic_offset, desc->buffer->size); - uint32_t buf_range = - MIN2(desc->range, desc->buffer->size - buf_offset); - - read_len = MIN2(range->length, - DIV_ROUND_UP(buf_range, 32) - range->start); - read_addr = anv_address_add(desc->buffer->address, - buf_offset + range->start * 32); - } - } - - if (read_len > 0) { - c.ConstantBody.Buffer[n] = read_addr; - c.ConstantBody.ReadLength[n] = read_len; - n--; + unsigned bound_size = + get_push_range_bound_size(cmd_buffer, stage, range); + if (bound_size >= range->start * 32) { + unsigned bound_regs = + MIN2(DIV_ROUND_UP(bound_size, 32) - range->start, + range->length); + assert(range_start_reg + bound_regs <= 64); + push->push_reg_mask |= BITFIELD64_RANGE(range_start_reg, + bound_regs); } - } - struct anv_state state = - anv_cmd_buffer_push_constants(cmd_buffer, stage); + cmd_buffer->state.push_constants_dirty |= + mesa_to_vk_shader_stage(stage); - if (state.alloc_size > 0) { - c.ConstantBody.Buffer[n] = (struct anv_address) { - .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, - .offset = state.offset, - }; - c.ConstantBody.ReadLength[n] = - DIV_ROUND_UP(state.alloc_size, 32); + range_start_reg += range->length; } -#else - /* For Ivy Bridge, the push constants packets have a different - * rule that would require us to iterate in the other direction - * and possibly mess around with dynamic state base address. - * Don't bother; just emit regular push constants at n = 0. - */ - struct anv_state state = - anv_cmd_buffer_push_constants(cmd_buffer, stage); + } - if (state.alloc_size > 0) { - c.ConstantBody.Buffer[0].offset = state.offset, - c.ConstantBody.ReadLength[0] = - DIV_ROUND_UP(state.alloc_size, 32); - } -#endif + /* We have to gather buffer addresses as a second step because the + * loop above puts data into the push constant area and the call to + * get_push_range_address is what locks our push constants and copies + * them into the actual GPU buffer. If we did the two loops at the + * same time, we'd risk only having some of the sizes in the push + * constant buffer when we did the copy. 
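/* A standalone sketch of the push_reg_mask computation in the
 * robust-buffer-access loop just above: for each non-empty push range, count
 * how many of its 32-byte registers are actually backed by the bound buffer
 * and mark them in a 64-bit mask, then advance by the full range length
 * whether or not the range was fully bound.  BITFIELD64_RANGE and
 * DIV_ROUND_UP are modelled by the plain C below.
 */
#include <stdint.h>

static uint64_t
sk_bitfield64_range(unsigned start, unsigned count)
{
   return count ? (~0ull >> (64 - count)) << start : 0;
}

struct sk_push_range { unsigned start, length; }; /* in 32-byte registers */

static uint64_t
sk_compute_push_reg_mask(const struct sk_push_range ranges[4],
                         const uint32_t bound_sizes[4] /* in bytes */)
{
   uint64_t mask = 0;
   unsigned range_start_reg = 0;

   for (unsigned i = 0; i < 4; i++) {
      if (ranges[i].length == 0)
         continue;

      if (bound_sizes[i] >= ranges[i].start * 32) {
         unsigned bound_regs = (bound_sizes[i] + 31) / 32 - ranges[i].start;
         if (bound_regs > ranges[i].length)
            bound_regs = ranges[i].length;
         mask |= sk_bitfield64_range(range_start_reg, bound_regs);
      }

      range_start_reg += ranges[i].length;
   }

   return mask;
}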
+ */ + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + if (range->length == 0) + break; + + buffers[i] = get_push_range_address(cmd_buffer, stage, range); + max_push_range = MAX2(max_push_range, range->length); + buffer_count++; } + + /* We have at most 4 buffers but they should be tightly packed */ + for (unsigned i = buffer_count; i < 4; i++) + assert(bind_map->push_ranges[i].length == 0); } - flushed |= mesa_to_vk_shader_stage(stage); +#if GEN_GEN >= 12 + /* If this stage doesn't have any push constants, emit it later in a + * single CONSTANT_ALL packet. + */ + if (buffer_count == 0) { + nobuffer_stages |= 1 << stage; + continue; + } + + /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL + * contains only 5 bits, so we can only use it for buffers smaller than + * 32. + */ + if (max_push_range < 32) { + cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage, + buffers, buffer_count); + continue; + } +#endif + + cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count); } +#if GEN_GEN >= 12 + if (nobuffer_stages) + cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0); +#endif + cmd_buffer->state.push_constants_dirty &= ~flushed; } void genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) { - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; uint32_t *p; - uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; - if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) - vb_emit |= pipeline->vb_used; - assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); - genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); + + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1); genX(flush_pipeline_select_3d)(cmd_buffer); + /* Apply any pending pipeline flushes we may have. We want to apply them + * now because, if any of those flushes are for things like push constants, + * the GPU will read the state at weird times. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) + vb_emit |= pipeline->vb_used; + if (vb_emit) { const uint32_t num_buffers = __builtin_popcount(vb_emit); const uint32_t num_dwords = 1 + num_buffers * 4; @@ -2608,6 +3259,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) #endif }; +#if GEN_GEN >= 8 && GEN_GEN <= 9 + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb, + state.BufferStartingAddress, + state.BufferSize); +#endif + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); i++; } @@ -2615,13 +3272,41 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.gfx.vb_dirty &= ~vb_emit; - if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { - anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); - - /* The exact descriptor layout is pulled from the pipeline, so we need - * to re-emit binding tables on every pipeline change. +#if GEN_GEN >= 8 + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) { + /* We don't need any per-buffer dirty tracking because you're not + * allowed to bind different XFB buffers while XFB is enabled. 
*/ - cmd_buffer->state.descriptors_dirty |= pipeline->active_stages; + for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { + struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx]; + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { +#if GEN_GEN < 12 + sob.SOBufferIndex = idx; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx; +#endif + + if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { + sob.SOBufferEnable = true; + sob.MOCS = cmd_buffer->device->isl_dev.mocs.internal, + sob.StreamOffsetWriteEnable = false; + sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address, + xfb->offset); + /* Size is in DWords - 1 */ + sob.SurfaceSize = xfb->size / 4 - 1; + } + } + } + + /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */ + if (GEN_GEN >= 10) + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + } +#endif + + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { + anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); /* If the pipeline changed, we may need to re-allocate push constant * space in the URB. @@ -2646,7 +3331,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) pc.DepthStallEnable = true; pc.PostSyncOperation = WriteImmediateData; pc.Address = - (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; } } #endif @@ -2663,8 +3348,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect. */ uint32_t dirty = 0; - if (cmd_buffer->state.descriptors_dirty) - dirty = flush_descriptor_sets(cmd_buffer); + if (cmd_buffer->state.descriptors_dirty) { + dirty = flush_descriptor_sets(cmd_buffer, + &cmd_buffer->state.gfx.base, + pipeline->shaders, + ARRAY_SIZE(pipeline->shaders)); + } if (dirty || cmd_buffer->state.push_constants_dirty) { /* Because we're pushing UBOs, we have to push whenever either @@ -2692,8 +3381,6 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) gen7_cmd_buffer_emit_scissor(cmd_buffer); genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); - - genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } static void @@ -2709,7 +3396,8 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, .VertexBufferIndex = index, .AddressModifyEnable = true, .BufferPitch = 0, - .MOCS = anv_mocs_for_bo(cmd_buffer->device, addr.bo), + .MOCS = addr.bo ? anv_mocs_for_bo(cmd_buffer->device, addr.bo) : 0, + .NullVertexBuffer = size == 0, #if (GEN_GEN >= 8) .BufferStartingAddress = addr, .BufferSize = size @@ -2718,31 +3406,38 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, .EndAddress = anv_address_add(addr, size), #endif }); + + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, + index, addr, size); } static void emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, struct anv_address addr) { - emit_vertex_bo(cmd_buffer, addr, 8, ANV_SVGS_VB_INDEX); + emit_vertex_bo(cmd_buffer, addr, addr.bo ? 
8 : 0, ANV_SVGS_VB_INDEX); } static void emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, uint32_t base_vertex, uint32_t base_instance) { - struct anv_state id_state = - anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); + if (base_vertex == 0 && base_instance == 0) { + emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS); + } else { + struct anv_state id_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); - ((uint32_t *)id_state.map)[0] = base_vertex; - ((uint32_t *)id_state.map)[1] = base_instance; + ((uint32_t *)id_state.map)[0] = base_vertex; + ((uint32_t *)id_state.map)[1] = base_instance; - struct anv_address addr = { - .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, - .offset = id_state.offset, - }; + struct anv_address addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = id_state.offset, + }; - emit_base_vertex_instance_bo(cmd_buffer, addr); + emit_base_vertex_instance_bo(cmd_buffer, addr); + } } static void @@ -2761,6 +3456,25 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); } +static void +update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint64_t vb_used = pipeline->vb_used; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + vb_used |= 1ull << ANV_SVGS_VB_INDEX; + if (vs_prog_data->uses_drawid) + vb_used |= 1ull << ANV_DRAWID_VB_INDEX; + + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, + access_type == RANDOM, + vb_used); +} + void genX(CmdDraw)( VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -2769,7 +3483,7 @@ void genX(CmdDraw)( uint32_t firstInstance) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); if (anv_batch_has_error(&cmd_buffer->batch)) @@ -2777,18 +3491,28 @@ void genX(CmdDraw)( genX(cmd_buffer_flush_state)(cmd_buffer); + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + if (vs_prog_data->uses_firstvertex || vs_prog_data->uses_baseinstance) emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. 
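The multiview note above recurs in every draw entry point: unless primitive replication is in use, each view becomes an extra hardware instance. A standalone sketch of that adjustment, assuming the view count is the popcount of the subpass view mask clamped to at least 1 (the driver's own anv_subpass_view_count() helper encapsulates this; whether it is implemented exactly this way is an assumption, and the other names below are made up):

#include <stdbool.h>
#include <stdint.h>

static uint32_t
sketch_view_count(uint32_t view_mask)
{
   uint32_t n = (uint32_t)__builtin_popcount(view_mask);  /* GCC/Clang builtin */
   return n ? n : 1;                                      /* non-multiview == 1 view */
}

static uint32_t
effective_instance_count(uint32_t instance_count, uint32_t view_mask,
                         bool use_primitive_replication)
{
   /* With primitive replication the hardware replays each primitive for all
    * views, so the instance count is left untouched; otherwise every view is
    * drawn as a separate instance.
    */
   return use_primitive_replication ? instance_count
                                    : instance_count * sketch_view_count(view_mask);
}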
*/ - instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); + if (!pipeline->use_primitive_replication) + instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = pipeline->topology; prim.VertexCountPerInstance = vertexCount; @@ -2798,7 +3522,7 @@ void genX(CmdDraw)( prim.BaseVertexLocation = 0; } - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); } void genX(CmdDrawIndexed)( @@ -2810,7 +3534,7 @@ void genX(CmdDrawIndexed)( uint32_t firstInstance) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); if (anv_batch_has_error(&cmd_buffer->batch)) @@ -2818,18 +3542,28 @@ void genX(CmdDrawIndexed)( genX(cmd_buffer_flush_state)(cmd_buffer); + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + if (vs_prog_data->uses_firstvertex || vs_prog_data->uses_baseinstance) emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance); if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. 
*/ - instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); + if (!pipeline->use_primitive_replication) + instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; prim.VertexAccessType = RANDOM; prim.PrimitiveTopologyType = pipeline->topology; prim.VertexCountPerInstance = indexCount; @@ -2839,7 +3573,7 @@ void genX(CmdDrawIndexed)( prim.BaseVertexLocation = vertexOffset; } - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); } /* Auto-Draw / Indirect Registers */ @@ -2850,90 +3584,109 @@ void genX(CmdDrawIndexed)( #define GEN7_3DPRIM_START_INSTANCE 0x243C #define GEN7_3DPRIM_BASE_VERTEX 0x2440 -/* MI_MATH only exists on Haswell+ */ +void genX(CmdDrawIndirectByteCountEXT)( + VkCommandBuffer commandBuffer, + uint32_t instanceCount, + uint32_t firstInstance, + VkBuffer counterBuffer, + VkDeviceSize counterBufferOffset, + uint32_t counterOffset, + uint32_t vertexStride) +{ #if GEN_IS_HASWELL || GEN_GEN >= 8 + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); -/* Emit dwords to multiply GPR0 by N */ -static void -build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N) -{ - VK_OUTARRAY_MAKE(out, dw, dw_count); + /* firstVertex is always zero for this draw function */ + const uint32_t firstVertex = 0; -#define append_alu(opcode, operand1, operand2) \ - vk_outarray_append(&out, alu_dw) *alu_dw = mi_alu(opcode, operand1, operand2) + if (anv_batch_has_error(&cmd_buffer->batch)) + return; - assert(N > 0); - unsigned top_bit = 31 - __builtin_clz(N); - for (int i = top_bit - 1; i >= 0; i--) { - /* We get our initial data in GPR0 and we write the final data out to - * GPR0 but we use GPR1 as our scratch register. - */ - unsigned src_reg = i == top_bit - 1 ? MI_ALU_REG0 : MI_ALU_REG1; - unsigned dst_reg = i == 0 ? MI_ALU_REG0 : MI_ALU_REG1; - - /* Shift the current value left by 1 */ - append_alu(MI_ALU_LOAD, MI_ALU_SRCA, src_reg); - append_alu(MI_ALU_LOAD, MI_ALU_SRCB, src_reg); - append_alu(MI_ALU_ADD, 0, 0); - - if (N & (1 << i)) { - /* Store ACCU to R1 and add R0 to R1 */ - append_alu(MI_ALU_STORE, MI_ALU_REG1, MI_ALU_ACCU); - append_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0); - append_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1); - append_alu(MI_ALU_ADD, 0, 0); - } + genX(cmd_buffer_flush_state)(cmd_buffer); - append_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU); - } + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, 0); -#undef append_alu -} + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); -static void -emit_mul_gpr0(struct anv_batch *batch, uint32_t N) -{ - uint32_t num_dwords; - build_alu_multiply_gpr0(NULL, &num_dwords, N); + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. We need to multiply instanceCount by the view count. 
+ */ + if (!pipeline->use_primitive_replication) + instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); + + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + struct gen_mi_value count = + gen_mi_mem32(anv_address_add(counter_buffer->address, + counterBufferOffset)); + if (counterOffset) + count = gen_mi_isub(&b, count, gen_mi_imm(counterOffset)); + count = gen_mi_udiv32_imm(&b, count, vertexStride); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_VERTEX_COUNT), count); + + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_VERTEX), + gen_mi_imm(firstVertex)); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_INSTANCE_COUNT), + gen_mi_imm(instanceCount)); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_INSTANCE), + gen_mi_imm(firstInstance)); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_BASE_VERTEX), gen_mi_imm(0)); - uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH)); - build_alu_multiply_gpr0(dw + 1, &num_dwords, N); -} + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = pipeline->topology; + } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); #endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ +} static void load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, struct anv_address addr, bool indexed) { - struct anv_batch *batch = &cmd_buffer->batch; + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); - emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, anv_address_add(addr, 0)); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_VERTEX_COUNT), + gen_mi_mem32(anv_address_add(addr, 0))); + struct gen_mi_value instance_count = gen_mi_mem32(anv_address_add(addr, 4)); unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass); if (view_count > 1) { #if GEN_IS_HASWELL || GEN_GEN >= 8 - emit_lrm(batch, CS_GPR(0), anv_address_add(addr, 4)); - emit_mul_gpr0(batch, view_count); - emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0)); + instance_count = gen_mi_imul_imm(&b, instance_count, view_count); #else anv_finishme("Multiview + indirect draw requires MI_MATH; " "MI_MATH is not supported on Ivy Bridge"); - emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, anv_address_add(addr, 4)); #endif - } else { - emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, anv_address_add(addr, 4)); } + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_INSTANCE_COUNT), instance_count); - emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, anv_address_add(addr, 8)); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_VERTEX), + gen_mi_mem32(anv_address_add(addr, 8))); if (indexed) { - emit_lrm(batch, GEN7_3DPRIM_BASE_VERTEX, anv_address_add(addr, 12)); - emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, anv_address_add(addr, 16)); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_BASE_VERTEX), + gen_mi_mem32(anv_address_add(addr, 12))); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_INSTANCE), + gen_mi_mem32(anv_address_add(addr, 16))); } else { - emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, anv_address_add(addr, 12)); - emit_lri(batch, GEN7_3DPRIM_BASE_VERTEX, 0); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_INSTANCE), + gen_mi_mem32(anv_address_add(addr, 12))); + gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_BASE_VERTEX), gen_mi_imm(0)); } } @@ -2946,7 +3699,7 @@ void genX(CmdDrawIndirect)( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; 
+ struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); if (anv_batch_has_error(&cmd_buffer->batch)) @@ -2954,6 +3707,9 @@ void genX(CmdDrawIndirect)( genX(cmd_buffer_flush_state)(cmd_buffer); + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + for (uint32_t i = 0; i < drawCount; i++) { struct anv_address draw = anv_address_add(buffer->address, offset); @@ -2963,18 +3719,24 @@ void genX(CmdDrawIndirect)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, false); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } - - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; } void genX(CmdDrawIndexedIndirect)( @@ -2986,7 +3748,7 @@ void genX(CmdDrawIndexedIndirect)( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); if (anv_batch_has_error(&cmd_buffer->batch)) @@ -2994,6 +3756,9 @@ void genX(CmdDrawIndexedIndirect)( genX(cmd_buffer_flush_state)(cmd_buffer); + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + for (uint32_t i = 0; i < drawCount; i++) { struct anv_address draw = anv_address_add(buffer->address, offset); @@ -3004,88 +3769,367 @@ void genX(CmdDrawIndexedIndirect)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
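For reference, the offsets read by load_indirect_parameters() above line up with the Vulkan indirect command layouts; the structs below restate the Vulkan specification definitions (renamed here to avoid clashing with vulkan.h), and the register mapping in the comments mirrors the gen_mi_store() calls in the patch:

#include <stdint.h>

/* VkDrawIndirectCommand -- non-indexed path (indexed == false). */
typedef struct {
   uint32_t vertexCount;    /* offset 0  -> 3DPRIM_VERTEX_COUNT   */
   uint32_t instanceCount;  /* offset 4  -> 3DPRIM_INSTANCE_COUNT (scaled for multiview) */
   uint32_t firstVertex;    /* offset 8  -> 3DPRIM_START_VERTEX   */
   uint32_t firstInstance;  /* offset 12 -> 3DPRIM_START_INSTANCE */
} sketch_DrawIndirectCommand;

/* VkDrawIndexedIndirectCommand -- indexed path (indexed == true). */
typedef struct {
   uint32_t indexCount;     /* offset 0  -> 3DPRIM_VERTEX_COUNT   */
   uint32_t instanceCount;  /* offset 4  -> 3DPRIM_INSTANCE_COUNT */
   uint32_t firstIndex;     /* offset 8  -> 3DPRIM_START_VERTEX   */
   int32_t  vertexOffset;   /* offset 12 -> 3DPRIM_BASE_VERTEX    */
   uint32_t firstInstance;  /* offset 16 -> 3DPRIM_START_INSTANCE */
} sketch_DrawIndexedIndirectCommand;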
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; prim.VertexAccessType = RANDOM; prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } - - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; } -static VkResult -flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) +#define TMP_DRAW_COUNT_REG 0x2670 /* MI_ALU_REG14 */ + +static void +prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer, + struct anv_address count_address, + const bool conditional_render_enabled) { - struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; - struct anv_state surfaces = { 0, }, samplers = { 0, }; - VkResult result; + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); - result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); - if (result != VK_SUCCESS) { - assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); + if (conditional_render_enabled) { +#if GEN_GEN >= 8 || GEN_IS_HASWELL + gen_mi_store(&b, gen_mi_reg64(TMP_DRAW_COUNT_REG), + gen_mi_mem32(count_address)); +#endif + } else { + /* Upload the current draw count from the draw parameters buffer to + * MI_PREDICATE_SRC0. + */ + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), + gen_mi_mem32(count_address)); - result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); - if (result != VK_SUCCESS) - return result; + gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC1 + 4), gen_mi_imm(0)); + } +} - /* Re-emit state base addresses so we get the new surface state base - * address before we start emitting binding tables etc. +static void +emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer, + uint32_t draw_index) +{ + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + /* Upload the index of the current primitive to MI_PREDICATE_SRC1. 
*/ + gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC1), gen_mi_imm(draw_index)); + + if (draw_index == 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } else { + /* While draw_index < draw_count the predicate's result will be + * (draw_index == draw_count) ^ TRUE = TRUE + * When draw_index == draw_count the result is + * (TRUE) ^ TRUE = FALSE + * After this all results will be: + * (FALSE) ^ FALSE = FALSE */ - genX(cmd_buffer_emit_state_base_address)(cmd_buffer); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_XOR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } +} - result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); - if (result != VK_SUCCESS) { - anv_batch_set_error(&cmd_buffer->batch, result); - return result; +#if GEN_GEN >= 8 || GEN_IS_HASWELL +static void +emit_draw_count_predicate_with_conditional_render( + struct anv_cmd_buffer *cmd_buffer, + uint32_t draw_index) +{ + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + struct gen_mi_value pred = gen_mi_ult(&b, gen_mi_imm(draw_index), + gen_mi_reg64(TMP_DRAW_COUNT_REG)); + pred = gen_mi_iand(&b, pred, gen_mi_reg64(ANV_PREDICATE_RESULT_REG)); + +#if GEN_GEN >= 8 + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_RESULT), pred); +#else + /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser + * so we emit MI_PREDICATE to set it. + */ + + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), pred); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +#endif +} +#endif + +void genX(CmdDrawIndirectCount)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + genX(cmd_buffer_flush_state)(cmd_buffer); + + struct anv_address count_address = + anv_address_add(count_buffer->address, countBufferOffset); + + prepare_for_draw_count_predicate(cmd_buffer, count_address, + cmd_state->conditional_render_enabled); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + +#if GEN_GEN >= 8 || GEN_IS_HASWELL + if (cmd_state->conditional_render_enabled) { + emit_draw_count_predicate_with_conditional_render(cmd_buffer, i); + } else { + emit_draw_count_predicate(cmd_buffer, i); + } +#else + emit_draw_count_predicate(cmd_buffer, i); +#endif + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
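The comment in emit_draw_count_predicate() above describes the predicate arithmetic; restated here as a plain-C software model (only the intent described by the comment is modeled, not the MI_PREDICATE hardware encoding; following the comment's convention, 'true' means the i-th predicated 3DPRIMITIVE executes):

#include <stdbool.h>
#include <stdint.h>

static void
model_draw_count_predicate(uint32_t draw_count, uint32_t max_draw_count,
                           bool runs[])
{
   bool predicate = false;

   for (uint32_t i = 0; i < max_draw_count; i++) {
      bool srcs_equal = (i == draw_count);        /* SRC1 == SRC0 comparison */
      if (i == 0)
         predicate = !srcs_equal;                 /* LOADINV + COMBINE_SET   */
      else
         predicate = predicate ^ srcs_equal;      /* LOAD + COMBINE_XOR      */
      runs[i] = predicate;   /* true while i < draw_count, false afterwards  */
   }
}

With draw_count = 2 and max_draw_count = 4 this yields true, true, false, false, matching the truth table spelled out in the comment.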
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, false); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = true; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = pipeline->topology; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + + offset += stride; } +} - result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers); - if (result != VK_SUCCESS) { - anv_batch_set_error(&cmd_buffer->batch, result); - return result; +void genX(CmdDrawIndexedIndirectCount)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + genX(cmd_buffer_flush_state)(cmd_buffer); + + struct anv_address count_address = + anv_address_add(count_buffer->address, countBufferOffset); + + prepare_for_draw_count_predicate(cmd_buffer, count_address, + cmd_state->conditional_render_enabled); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + +#if GEN_GEN >= 8 || GEN_IS_HASWELL + if (cmd_state->conditional_render_enabled) { + emit_draw_count_predicate_with_conditional_render(cmd_buffer, i); + } else { + emit_draw_count_predicate(cmd_buffer, i); + } +#else + emit_draw_count_predicate(cmd_buffer, i); +#endif + + /* TODO: We need to stomp base vertex to 0 somehow */ + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
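When conditional rendering is enabled, the *IndirectCount paths above instead build the predicate with gen_mi_ult() and gen_mi_iand() on TMP_DRAW_COUNT_REG and ANV_PREDICATE_RESULT_REG. In plain C the intended value is simply the following (a statement of intent only, not the MI encoding):

#include <stdbool.h>
#include <stdint.h>

static bool
combined_draw_predicate(uint32_t draw_index, uint32_t draw_count,
                        bool conditional_render_passed)
{
   return (draw_index < draw_count) && conditional_render_passed;
}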
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = true; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = pipeline->topology; + } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + + offset += stride; } +} - uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; - struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { - .BindingTablePointer = surfaces.offset, - .SamplerStatePointer = samplers.offset, - }; - GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); +void genX(CmdBeginTransformFeedbackEXT)( + VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer* pCounterBuffers, + const VkDeviceSize* pCounterBufferOffsets) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - struct anv_state state = - anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, - pipeline->interface_descriptor_data, - GENX(INTERFACE_DESCRIPTOR_DATA_length), - 64); + assert(firstCounterBuffer < MAX_XFB_BUFFERS); + assert(counterBufferCount <= MAX_XFB_BUFFERS); + assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS); - uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); - anv_batch_emit(&cmd_buffer->batch, - GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { - mid.InterfaceDescriptorTotalLength = size; - mid.InterfaceDescriptorDataStartAddress = state.offset; + /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: + * + * "Ssoftware must ensure that no HW stream output operations can be in + * process or otherwise pending at the point that the MI_LOAD/STORE + * commands are processed. This will likely require a pipeline flush." + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) { + /* If we have a counter buffer, this is a resume so we need to load the + * value into the streamout offset register. Otherwise, this is a begin + * and we need to reset it to zero. + */ + if (pCounterBuffers && + idx >= firstCounterBuffer && + idx - firstCounterBuffer < counterBufferCount && + pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) { + uint32_t cb_idx = idx - firstCounterBuffer; + ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]); + uint64_t offset = pCounterBufferOffsets ? 
+ pCounterBufferOffsets[cb_idx] : 0; + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4; + lrm.MemoryAddress = anv_address_add(counter_buffer->address, + offset); + } + } else { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4; + lri.DataDWord = 0; + } + } } - return VK_SUCCESS; + cmd_buffer->state.xfb_enabled = true; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; +} + +void genX(CmdEndTransformFeedbackEXT)( + VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer* pCounterBuffers, + const VkDeviceSize* pCounterBufferOffsets) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + assert(firstCounterBuffer < MAX_XFB_BUFFERS); + assert(counterBufferCount <= MAX_XFB_BUFFERS); + assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS); + + /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: + * + * "Ssoftware must ensure that no HW stream output operations can be in + * process or otherwise pending at the point that the MI_LOAD/STORE + * commands are processed. This will likely require a pipeline flush." + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) { + unsigned idx = firstCounterBuffer + cb_idx; + + /* If we have a counter buffer, this is a resume so we need to load the + * value into the streamout offset register. Otherwise, this is a begin + * and we need to reset it to zero. + */ + if (pCounterBuffers && + cb_idx < counterBufferCount && + pCounterBuffers[cb_idx] != VK_NULL_HANDLE) { + ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]); + uint64_t offset = pCounterBufferOffsets ? + pCounterBufferOffsets[cb_idx] : 0; + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { + srm.MemoryAddress = anv_address_add(counter_buffer->address, + offset); + srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4; + } + } + } + + cmd_buffer->state.xfb_enabled = false; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; } void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) { - struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; - MAYBE_UNUSED VkResult result; + struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline; - assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT); + assert(pipeline->cs); - genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); genX(flush_pipeline_select_gpgpu)(cmd_buffer); + /* Apply any pending pipeline flushes we may have. We want to apply them + * now because, if any of those flushes are for things like push constants, + * the GPU will read the state at weird times. 
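The Begin/End pair above implements pause/resume of transform feedback through the counter buffers: Begin loads SO_WRITE_OFFSETn from the counter buffer when one is provided (resume) or resets it with an immediate zero, and End stores the register back so a later Begin can pick up where the stream left off. A compact CPU-side model of one binding, with hypothetical state names (not part of the patch):

#include <stdbool.h>
#include <stdint.h>

struct xfb_counter_sketch {
   bool     has_saved_value;   /* was a counter buffer provided?        */
   uint32_t saved_value;       /* models the counter buffer contents    */
   uint32_t so_write_offset;   /* models the SO_WRITE_OFFSETn register  */
};

static void
sketch_begin_xfb(struct xfb_counter_sketch *c)
{
   /* MI_LOAD_REGISTER_MEM on resume, MI_LOAD_REGISTER_IMM(0) on a fresh begin. */
   c->so_write_offset = c->has_saved_value ? c->saved_value : 0;
}

static void
sketch_end_xfb(struct xfb_counter_sketch *c)
{
   /* MI_STORE_REGISTER_MEM: save the offset so a later begin can resume. */
   c->saved_value = c->so_write_offset;
   c->has_saved_value = true;
}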
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + if (cmd_buffer->state.compute.pipeline_dirty) { /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE: * @@ -3098,17 +4142,41 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); - anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); + anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); + + /* The workgroup size of the pipeline affects our push constant layout + * so flag push constants as dirty if we change the pipeline. + */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || cmd_buffer->state.compute.pipeline_dirty) { - /* FIXME: figure out descriptors for gen7 */ - result = flush_compute_descriptor_set(cmd_buffer); - if (result != VK_SUCCESS) - return; + flush_descriptor_sets(cmd_buffer, + &cmd_buffer->state.compute.base, + &pipeline->cs, 1); + + uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .BindingTablePointer = + cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, + .SamplerStatePointer = + cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, + }; + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); + + struct anv_state state = + anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, + pipeline->interface_descriptor_data, + GENX(INTERFACE_DESCRIPTOR_DATA_length), + 64); - cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; + uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); + anv_batch_emit(&cmd_buffer->batch, + GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { + mid.InterfaceDescriptorTotalLength = size; + mid.InterfaceDescriptorDataStartAddress = state.offset; + } } if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { @@ -3137,8 +4205,8 @@ verify_cmd_parser(const struct anv_device *device, int required_version, const char *function) { - if (device->instance->physicalDevice.cmd_parser_version < required_version) { - return vk_errorf(device->instance, device->instance, + if (device->physical->cmd_parser_version < required_version) { + return vk_errorf(device, device->physical, VK_ERROR_FEATURE_NOT_PRESENT, "cmd parser version %d is required for %s", required_version, function); @@ -3158,22 +4226,14 @@ anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer, if (anv_batch_has_error(&cmd_buffer->batch)) return; - VkResult result = - anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, MESA_SHADER_COMPUTE, - base_work_group_id); - if (result != VK_SUCCESS) { - cmd_buffer->batch.status = result; - return; - } - struct anv_push_constants *push = - cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; - if (push->base_work_group_id[0] != baseGroupX || - push->base_work_group_id[1] != baseGroupY || - push->base_work_group_id[2] != baseGroupZ) { - push->base_work_group_id[0] = baseGroupX; - push->base_work_group_id[1] = baseGroupY; - push->base_work_group_id[2] = baseGroupZ; + &cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; + if (push->cs.base_work_group_id[0] != baseGroupX || + push->cs.base_work_group_id[1] != baseGroupY || + push->cs.base_work_group_id[2] != baseGroupZ) { + push->cs.base_work_group_id[0] = baseGroupX; + push->cs.base_work_group_id[1] = baseGroupY; + push->cs.base_work_group_id[2] 
= baseGroupZ; cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } @@ -3198,7 +4258,7 @@ void genX(CmdDispatchBase)( uint32_t groupCountZ) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; + struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline; const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX, @@ -3218,15 +4278,22 @@ void genX(CmdDispatchBase)( .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, .offset = state.offset, }; + + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } genX(cmd_buffer_flush_compute_state)(cmd_buffer); + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) { + ggw.PredicateEnable = cmd_buffer->state.conditional_render_enabled; ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; + ggw.ThreadWidthCounterMaximum = anv_cs_threads(pipeline) - 1; ggw.ThreadGroupIDXDimension = groupCountX; ggw.ThreadGroupIDYDimension = groupCountY; ggw.ThreadGroupIDZDimension = groupCountZ; @@ -3248,7 +4315,7 @@ void genX(CmdDispatchIndirect)( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); - struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; + struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline; const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); struct anv_address addr = anv_address_add(buffer->address, offset); struct anv_batch *batch = &cmd_buffer->batch; @@ -3264,45 +4331,46 @@ void genX(CmdDispatchIndirect)( return; #endif - if (prog_data->uses_num_work_groups) + if (prog_data->uses_num_work_groups) { cmd_buffer->state.compute.num_workgroups = addr; + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + genX(cmd_buffer_flush_compute_state)(cmd_buffer); - emit_lrm(batch, GPGPU_DISPATCHDIMX, anv_address_add(addr, 0)); - emit_lrm(batch, GPGPU_DISPATCHDIMY, anv_address_add(addr, 4)); - emit_lrm(batch, GPGPU_DISPATCHDIMZ, anv_address_add(addr, 8)); + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); -#if GEN_GEN <= 7 - /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */ - emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0); - emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0); - emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0); + struct gen_mi_value size_x = gen_mi_mem32(anv_address_add(addr, 0)); + struct gen_mi_value size_y = gen_mi_mem32(anv_address_add(addr, 4)); + struct gen_mi_value size_z = gen_mi_mem32(anv_address_add(addr, 8)); - /* Load compute_dispatch_indirect_x_size into SRC0 */ - emit_lrm(batch, MI_PREDICATE_SRC0, anv_address_add(addr, 0)); + gen_mi_store(&b, gen_mi_reg32(GPGPU_DISPATCHDIMX), size_x); + gen_mi_store(&b, gen_mi_reg32(GPGPU_DISPATCHDIMY), size_y); + gen_mi_store(&b, gen_mi_reg32(GPGPU_DISPATCHDIMZ), size_z); +#if GEN_GEN <= 7 /* predicate = (compute_dispatch_indirect_x_size == 0); */ + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), size_x); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0)); anv_batch_emit(batch, 
GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOAD; mip.CombineOperation = COMBINE_SET; mip.CompareOperation = COMPARE_SRCS_EQUAL; } - /* Load compute_dispatch_indirect_y_size into SRC0 */ - emit_lrm(batch, MI_PREDICATE_SRC0, anv_address_add(addr, 4)); - /* predicate |= (compute_dispatch_indirect_y_size == 0); */ + gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC0), size_y); anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOAD; mip.CombineOperation = COMBINE_OR; mip.CompareOperation = COMPARE_SRCS_EQUAL; } - /* Load compute_dispatch_indirect_z_size into SRC0 */ - emit_lrm(batch, MI_PREDICATE_SRC0, anv_address_add(addr, 8)); - /* predicate |= (compute_dispatch_indirect_z_size == 0); */ + gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC0), size_z); anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOAD; mip.CombineOperation = COMBINE_OR; @@ -3310,21 +4378,38 @@ void genX(CmdDispatchIndirect)( } /* predicate = !predicate; */ -#define COMPARE_FALSE 1 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOADINV; mip.CombineOperation = COMBINE_OR; mip.CompareOperation = COMPARE_FALSE; } + +#if GEN_IS_HASWELL + if (cmd_buffer->state.conditional_render_enabled) { + /* predicate &= !(conditional_rendering_predicate == 0); */ + gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC0), + gen_mi_reg32(ANV_PREDICATE_RESULT_REG)); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_AND; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } +#endif + +#else /* GEN_GEN > 7 */ + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); #endif anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) { ggw.IndirectParameterEnable = true; - ggw.PredicateEnable = GEN_GEN <= 7; + ggw.PredicateEnable = GEN_GEN <= 7 || + cmd_buffer->state.conditional_render_enabled; ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; + ggw.ThreadWidthCounterMaximum = anv_cs_threads(pipeline) - 1; ggw.RightExecutionMask = pipeline->cs_right_mask; ggw.BottomExecutionMask = 0xffffffff; } @@ -3355,6 +4440,32 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t); #endif +#if GEN_GEN == 9 + if (pipeline == _3D) { + /* There is a mid-object preemption workaround which requires you to + * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However, + * even without preemption, we have issues with geometry flickering when + * GPGPU and 3D are back-to-back and this seems to fix it. We don't + * really know why. + */ + const uint32_t subslices = + MAX2(cmd_buffer->device->physical->subslice_total, 1); + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) { + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * subslices - 1; + vfe.NumberofURBEntries = 2; + vfe.URBEntryAllocationSize = 2; + } + + /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is + * invalid. Set the compute pipeline to dirty to force a re-emit of the + * pipeline in case we get back-to-back dispatch calls with the same + * pipeline and a PIPELINE_SELECT in between. 
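On gen7 the indirect dispatch above is predicated so that an empty dispatch is skipped: per the in-code comments, the MI_PREDICATE chain computes predicate = (x == 0), ORs in (y == 0) and (z == 0), and finally inverts the result, so the GPGPU_WALKER should only run when all three group counts read from the indirect buffer are non-zero. The same condition in plain C (intent only, not the MI encoding):

#include <stdbool.h>
#include <stdint.h>

static bool
dispatch_should_run(uint32_t group_count_x, uint32_t group_count_y,
                    uint32_t group_count_z)
{
   bool any_zero = (group_count_x == 0) ||
                   (group_count_y == 0) ||
                   (group_count_z == 0);
   return !any_zero;
}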
+ */ + cmd_buffer->state.compute.pipeline_dirty = true; + } +#endif + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] * PIPELINE_SELECT [DevBWR+]": * @@ -3371,6 +4482,14 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, pc.DCFlushEnable = true; pc.PostSyncOperation = NoWrite; pc.CommandStreamerStallEnable = true; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; + + /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be + * set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ + pc.DepthStallEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { @@ -3379,6 +4498,9 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, pc.StateCacheInvalidationEnable = true; pc.InstructionCacheInvalidateEnable = true; pc.PostSyncOperation = NoWrite; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { @@ -3445,45 +4567,254 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthCacheFlushEnable = true; +#if GEN_GEN >= 12 + pipe.TileCacheFlushEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthStallEnable = true; } } -static void -cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) +/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: + * + * "The VF cache needs to be invalidated before binding and then using + * Vertex Buffers that overlap with any previously bound Vertex Buffer + * (at a 64B granularity) since the last invalidation. A VF cache + * invalidate is performed by setting the "VF Cache Invalidation Enable" + * bit in PIPE_CONTROL." + * + * This is implemented by carefully tracking all vertex and index buffer + * bindings and flushing if the cache ever ends up with a range in the cache + * that would exceed 4 GiB. This is implemented in three parts: + * + * 1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called + * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the + * tracking code of the new binding. If this new binding would cause + * the cache to have a too-large range on the next draw call, a pipeline + * stall and VF cache invalidate are added to pending_pipeline_bits. + * + * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to + * empty whenever we emit a VF invalidate. + * + * 3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called + * after every 3DPRIMITIVE and copies the bound range into the dirty + * range for each used buffer. This has to be a separate step because + * we don't always re-bind all buffers and so 1. can't know which + * buffers are actually bound. + */ +void +genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size) { - struct anv_device *device = cmd_buffer->device; - const struct anv_image_view *iview = - anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); - const struct anv_image *image = iview ? 
iview->image : NULL; - - /* FIXME: Width and Height are wrong */ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->physical->use_softpin) + return; - genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer); + struct anv_vb_cache_range *bound, *dirty; + if (vb_index == -1) { + bound = &cmd_buffer->state.gfx.ib_bound_range; + dirty = &cmd_buffer->state.gfx.ib_dirty_range; + } else { + assert(vb_index >= 0); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index]; + } - uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch, - device->isl_dev.ds.size / 4); - if (dw == NULL) + if (vb_size == 0) { + bound->start = 0; + bound->end = 0; return; + } - struct isl_depth_stencil_hiz_emit_info info = { }; + assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED)); + bound->start = gen_48b_address(anv_address_physical(vb_address)); + bound->end = bound->start + vb_size; + assert(bound->end > bound->start); /* No overflow */ - if (iview) - info.view = &iview->planes[0].isl; + /* Align everything to a cache line */ + bound->start &= ~(64ull - 1ull); + bound->end = align_u64(bound->end, 64); - if (image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { - uint32_t depth_plane = - anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT); - const struct anv_surface *surface = &image->planes[depth_plane].surface; + /* Compute the dirty range */ + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); - info.depth_surf = &surface->isl; + /* If our range is larger than 32 bits, we have to flush */ + assert(bound->end - bound->start <= (1ull << 32)); + if (dirty->end - dirty->start > (1ull << 32)) { + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } +} - info.depth_address = - anv_batch_emit_reloc(&cmd_buffer->batch, - dw + device->isl_dev.ds.depth_offset / 4, - image->planes[depth_plane].address.bo, +void +genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->physical->use_softpin) + return; + + if (access_type == RANDOM) { + /* We have an index buffer */ + struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range; + struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } + + uint64_t mask = vb_used; + while (mask) { + int i = u_bit_scan64(&mask); + assert(i >= 0); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + + struct anv_vb_cache_range *bound, *dirty; + bound = &cmd_buffer->state.gfx.vb_bound_ranges[i]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i]; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } +} + +/** + * Update the pixel hashing modes that determine the balancing of PS threads + * across subslices and slices. + * + * \param width Width bound of the rendering area (already scaled down if \p + * scale is greater than 1). 
+ * \param height Height bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param scale The number of framebuffer samples that could potentially be + * affected by an individual channel of the PS thread. This is + * typically one for single-sampled rendering, but for operations + * like CCS resolves and fast clears a single PS invocation may + * update a huge number of pixels, in which case a finer + * balancing is desirable in order to maximally utilize the + * bandwidth available. UINT_MAX can be used as shorthand for + * "finest hashing mode available". + */ +void +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale) +{ +#if GEN_GEN == 9 + const struct gen_device_info *devinfo = &cmd_buffer->device->info; + const unsigned slice_hashing[] = { + /* Because all Gen9 platforms with more than one slice require + * three-way subslice hashing, a single "normal" 16x16 slice hashing + * block is guaranteed to suffer from substantial imbalance, with one + * subslice receiving twice as much work as the other two in the + * slice. + * + * The performance impact of that would be particularly severe when + * three-way hashing is also in use for slice balancing (which is the + * case for all Gen9 GT4 platforms), because one of the slices + * receives one every three 16x16 blocks in either direction, which + * is roughly the periodicity of the underlying subslice imbalance + * pattern ("roughly" because in reality the hardware's + * implementation of three-way hashing doesn't do exact modulo 3 + * arithmetic, which somewhat decreases the magnitude of this effect + * in practice). This leads to a systematic subslice imbalance + * within that slice regardless of the size of the primitive. The + * 32x32 hashing mode guarantees that the subslice imbalance within a + * single slice hashing block is minimal, largely eliminating this + * effect. + */ + _32x32, + /* Finest slice hashing mode available. */ + NORMAL + }; + const unsigned subslice_hashing[] = { + /* 16x16 would provide a slight cache locality benefit especially + * visible in the sampler L1 cache efficiency of low-bandwidth + * non-LLC platforms, but it comes at the cost of greater subslice + * imbalance for primitives of dimensions approximately intermediate + * between 16x4 and 16x16. + */ + _16x4, + /* Finest subslice hashing mode available. */ + _8x4 + }; + /* Dimensions of the smallest hashing block of a given hashing mode. If + * the rendering area is smaller than this there can't possibly be any + * benefit from switching to this mode, so we optimize out the + * transition. + */ + const unsigned min_size[][2] = { + { 16, 4 }, + { 8, 4 } + }; + const unsigned idx = scale > 1; + + if (cmd_buffer->state.current_hash_scale != scale && + (width > min_size[idx][0] || height > min_size[idx][1])) { + uint32_t gt_mode; + + anv_pack_struct(>_mode, GENX(GT_MODE), + .SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0), + .SliceHashingMask = (devinfo->num_slices > 1 ? 
-1 : 0), + .SubsliceHashing = subslice_hashing[idx], + .SubsliceHashingMask = -1); + + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode); + + cmd_buffer->state.current_hash_scale = scale; + } +#endif +} + +static void +cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_device *device = cmd_buffer->device; + const struct anv_image_view *iview = + anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); + const struct anv_image *image = iview ? iview->image : NULL; + + /* FIXME: Width and Height are wrong */ + + genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer); + + uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch, + device->isl_dev.ds.size / 4); + if (dw == NULL) + return; + + struct isl_depth_stencil_hiz_emit_info info = { }; + + if (iview) + info.view = &iview->planes[0].isl; + + if (image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { + uint32_t depth_plane = + anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT); + const struct anv_surface *surface = &image->planes[depth_plane].surface; + + info.depth_surf = &surface->isl; + + info.depth_address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.ds.depth_offset / 4, + image->planes[depth_plane].address.bo, image->planes[depth_plane].address.offset + surface->offset); info.mocs = @@ -3492,7 +4823,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment->attachment; info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage; - if (info.hiz_usage == ISL_AUX_USAGE_HIZ) { + if (info.hiz_usage != ISL_AUX_USAGE_NONE) { + assert(isl_aux_usage_has_hiz(info.hiz_usage)); info.hiz_surf = &image->planes[depth_plane].aux_surface.isl; info.hiz_address = @@ -3525,7 +4857,24 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info); - cmd_buffer->state.hiz_enabled = info.hiz_usage == ISL_AUX_USAGE_HIZ; + if (GEN_GEN >= 12) { + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* GEN:BUG:1408224581 + * + * Workaround: Gen12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). 
+ */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteImmediateData; + pc.Address = + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; + } + } + cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage); } /** @@ -3569,7 +4918,8 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, uint32_t subpass_id) { struct anv_cmd_state *cmd_state = &cmd_buffer->state; - struct anv_subpass *subpass = &cmd_state->pass->subpasses[subpass_id]; + struct anv_render_pass *pass = cmd_state->pass; + struct anv_subpass *subpass = &pass->subpasses[subpass_id]; cmd_state->subpass = subpass; cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; @@ -3612,54 +4962,56 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, assert(a < cmd_state->pass->attachment_count); struct anv_attachment_state *att_state = &cmd_state->attachments[a]; - struct anv_image_view *iview = fb->attachments[a]; + struct anv_image_view *iview = cmd_state->attachments[a].image_view; const struct anv_image *image = iview->image; - /* A resolve is necessary before use as an input attachment if the clear - * color or auxiliary buffer usage isn't supported by the sampler. - */ - const bool input_needs_resolve = - (att_state->fast_clear && !att_state->clear_color_is_zero_one) || - att_state->input_aux_usage != att_state->aux_usage; - - VkImageLayout target_layout; - if (iview->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV && - !input_needs_resolve) { - /* Layout transitions before the final only help to enable sampling - * as an input attachment. If the input attachment supports sampling - * using the auxiliary surface, we can skip such transitions by - * making the target layout one that is CCS-aware. - */ - target_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + VkImageLayout target_layout = subpass->attachments[i].layout; + VkImageLayout target_stencil_layout = + subpass->attachments[i].stencil_layout; + + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(iview->image->extent.depth, + iview->planes[0].isl.base_level); } else { - target_layout = subpass->attachments[i].layout; + base_layer = iview->planes[0].isl.base_array_layer; + layer_count = fb->layers; } if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); - - uint32_t base_layer, layer_count; - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(iview->image->extent.depth, - iview->planes[0].isl.base_level); - } else { - base_layer = iview->planes[0].isl.base_array_layer; - layer_count = fb->layers; - } - transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, iview->planes[0].isl.base_level, 1, base_layer, layer_count, att_state->current_layout, target_layout); - } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + att_state->aux_usage = + anv_layout_to_aux_usage(&cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + target_layout); + } + + if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, + base_layer, layer_count, att_state->current_layout, target_layout); att_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, target_layout); + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + target_layout); + } + + if (image->aspects & 
VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + iview->planes[0].isl.base_level, 1, + base_layer, layer_count, + att_state->current_stencil_layout, + target_stencil_layout); } att_state->current_layout = target_layout; + att_state->current_stencil_layout = target_stencil_layout; if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) { assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); @@ -3682,6 +5034,7 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, if (iview->image->samples == 1) { anv_image_ccs_op(cmd_buffer, image, iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1, ISL_AUX_OP_FAST_CLEAR, &clear_color, @@ -3689,6 +5042,7 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, } else { anv_image_mcs_op(cmd_buffer, image, iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, ISL_AUX_OP_FAST_CLEAR, &clear_color, @@ -3699,7 +5053,8 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, if (is_multiview) att_state->pending_clear_views &= ~1; - if (att_state->clear_color_is_zero) { + if (isl_color_value_is_zero(clear_color, + iview->planes[0].isl.format)) { /* This image has the auxiliary buffer enabled. We can mark the * subresource as not needing a resolve because the clear color * will match what's in every RENDER_SURFACE_STATE object when @@ -3761,12 +5116,10 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, } else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { if (att_state->fast_clear && !is_multiview) { - /* We currently only support HiZ for single-layer images */ + /* We currently only support HiZ for single-LOD images */ if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { - assert(iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ); + assert(isl_aux_usage_has_hiz(iview->image->planes[0].aux_usage)); assert(iview->planes[0].isl.base_level == 0); - assert(iview->planes[0].isl.base_array_layer == 0); - assert(fb->layers == 1); } anv_image_hiz_clear(cmd_buffer, image, @@ -3809,29 +5162,184 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, assert(att_state->pending_clear_aspects == 0); } + /* If multiview is enabled, then we are only done clearing when we no + * longer have pending layers to clear, or when we have processed the + * last subpass that uses this attachment. + */ + if (!is_multiview || + att_state->pending_clear_views == 0 || + current_subpass_is_last_for_attachment(cmd_state, a)) { + att_state->pending_clear_aspects = 0; + } + + att_state->pending_load_aspects = 0; + } + + /* We've transitioned all our images possibly fast clearing them. Now we + * can fill out the surface states that we will use as render targets + * during actual subpass rendering. 
+ */ + VkResult result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer, + pass, subpass); + if (result != VK_SUCCESS) + return; + + isl_null_fill_state(&cmd_buffer->device->isl_dev, + cmd_state->null_surface_state.map, + isl_extent3d(fb->width, fb->height, fb->layers)); + + for (uint32_t i = 0; i < subpass->attachment_count; ++i) { + const uint32_t att = subpass->attachments[i].attachment; + if (att == VK_ATTACHMENT_UNUSED) + continue; + + assert(att < cmd_state->pass->attachment_count); + struct anv_render_pass_attachment *pass_att = &pass->attachments[att]; + struct anv_attachment_state *att_state = &cmd_state->attachments[att]; + struct anv_image_view *iview = att_state->image_view; + + if (!vk_format_is_color(pass_att->format)) + continue; + + const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage; + assert(util_bitcount(att_usage) == 1); + + struct anv_surface_state *surface_state; + isl_surf_usage_flags_t isl_surf_usage; + enum isl_aux_usage isl_aux_usage; + if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + surface_state = &att_state->color; + isl_surf_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT; + isl_aux_usage = att_state->aux_usage; + } else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { + surface_state = &att_state->input; + isl_surf_usage = ISL_SURF_USAGE_TEXTURE_BIT; + isl_aux_usage = + anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, + att_state->current_layout); + } else { + continue; + } + + /* We had better have a surface state when we get here */ + assert(surface_state->state.map); + + union isl_color_value clear_color = { .u32 = { 0, } }; + if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR && + att_state->fast_clear) + anv_clear_color_from_att_state(&clear_color, att_state, iview); + + anv_image_fill_surface_state(cmd_buffer->device, + iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + &iview->planes[0].isl, + isl_surf_usage, + isl_aux_usage, + &clear_color, + 0, + surface_state, + NULL); + + add_surface_state_relocs(cmd_buffer, *surface_state); + if (GEN_GEN < 10 && - (att_state->pending_load_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && - image->planes[0].aux_surface.isl.size_B > 0 && + pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD && + iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE && iview->planes[0].isl.base_level == 0 && iview->planes[0].isl.base_array_layer == 0) { - if (att_state->aux_usage != ISL_AUX_USAGE_NONE) { - genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color.state, - image, VK_IMAGE_ASPECT_COLOR_BIT, - false /* copy to ss */); - } - - if (need_input_attachment_state(&cmd_state->pass->attachments[a]) && - att_state->input_aux_usage != ISL_AUX_USAGE_NONE) { - genX(copy_fast_clear_dwords)(cmd_buffer, att_state->input.state, - image, VK_IMAGE_ASPECT_COLOR_BIT, - false /* copy to ss */); - } + genX(copy_fast_clear_dwords)(cmd_buffer, surface_state->state, + iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + false /* copy to ss */); } + } + +#if GEN_GEN >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Taget Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." 
+ */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT; +#endif +#if GEN_GEN == 12 + /* GEN:BUG:14010455700 + * + * ISL will change some CHICKEN registers depending on the depth surface + * format, along with emitting the depth and stencil packets. In that case, + * we want to do a depth flush and stall, so the pipeline is not using these + * settings while we change the registers. + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +#endif + + cmd_buffer_emit_depth_stencil(cmd_buffer); +} + +static enum blorp_filter +vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode) +{ + switch (vk_mode) { + case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR: + return BLORP_FILTER_SAMPLE_0; + case VK_RESOLVE_MODE_AVERAGE_BIT_KHR: + return BLORP_FILTER_AVERAGE; + case VK_RESOLVE_MODE_MIN_BIT_KHR: + return BLORP_FILTER_MIN_SAMPLE; + case VK_RESOLVE_MODE_MAX_BIT_KHR: + return BLORP_FILTER_MAX_SAMPLE; + default: + return BLORP_FILTER_NONE; + } +} + +static void +cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_subpass *subpass = cmd_state->subpass; + uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state); + struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; + + /* We are done with the previous subpass and all rendering directly to that + * subpass is now complete. Zero out all the surface states so we don't + * accidentally use them between now and the next subpass. + */ + for (uint32_t i = 0; i < cmd_state->pass->attachment_count; ++i) { + memset(&cmd_state->attachments[i].color, 0, + sizeof(cmd_state->attachments[i].color)); + memset(&cmd_state->attachments[i].input, 0, + sizeof(cmd_state->attachments[i].input)); + } + cmd_state->null_surface_state = ANV_STATE_NULL; + cmd_state->attachment_states = ANV_STATE_NULL; + + for (uint32_t i = 0; i < subpass->attachment_count; ++i) { + const uint32_t a = subpass->attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; + + assert(a < cmd_state->pass->attachment_count); + struct anv_attachment_state *att_state = &cmd_state->attachments[a]; + struct anv_image_view *iview = att_state->image_view; + + assert(util_bitcount(subpass->attachments[i].usage) == 1); if (subpass->attachments[i].usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { - /* We assume that if we're starting a subpass, we're going to do some - * rendering so we may end up with compressed data. + /* We assume that if we're ending a subpass, we did do some rendering + * so we may end up with compressed data. */ genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, VK_IMAGE_ASPECT_COLOR_BIT, @@ -3850,7 +5358,7 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, */ const struct isl_view *ds_view = &iview->planes[0].isl; if (iview->aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) { - genX(cmd_buffer_mark_image_written)(cmd_buffer, image, + genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, att_state->aux_usage, ds_view->base_level, @@ -3861,7 +5369,7 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, /* Even though stencil may be plane 1, it always shares a * base_level with depth. 
*/ - genX(cmd_buffer_mark_image_written)(cmd_buffer, image, + genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, VK_IMAGE_ASPECT_STENCIL_BIT, ISL_AUX_USAGE_NONE, ds_view->base_level, @@ -3869,47 +5377,7 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, fb->layers); } } - - /* If multiview is enabled, then we are only done clearing when we no - * longer have pending layers to clear, or when we have processed the - * last subpass that uses this attachment. - */ - if (!is_multiview || - att_state->pending_clear_views == 0 || - current_subpass_is_last_for_attachment(cmd_state, a)) { - att_state->pending_clear_aspects = 0; - } - - att_state->pending_load_aspects = 0; - } - - cmd_buffer_emit_depth_stencil(cmd_buffer); -} - -static enum blorp_filter -vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode) -{ - switch (vk_mode) { - case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR: - return BLORP_FILTER_SAMPLE_0; - case VK_RESOLVE_MODE_AVERAGE_BIT_KHR: - return BLORP_FILTER_AVERAGE; - case VK_RESOLVE_MODE_MIN_BIT_KHR: - return BLORP_FILTER_MIN_SAMPLE; - case VK_RESOLVE_MODE_MAX_BIT_KHR: - return BLORP_FILTER_MAX_SAMPLE; - default: - return BLORP_FILTER_NONE; } -} - -static void -cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) -{ - struct anv_cmd_state *cmd_state = &cmd_buffer->state; - struct anv_subpass *subpass = cmd_state->subpass; - uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state); - struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; if (subpass->has_color_resolve) { /* We are about to do some MSAA resolves. We need to flush so that the @@ -3941,8 +5409,8 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0; } - struct anv_image_view *src_iview = fb->attachments[src_att]; - struct anv_image_view *dst_iview = fb->attachments[dst_att]; + struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view; + struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view; const VkRect2D render_area = cmd_buffer->state.render_area; @@ -3996,31 +5464,34 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0; } - struct anv_image_view *src_iview = fb->attachments[src_att]; - struct anv_image_view *dst_iview = fb->attachments[dst_att]; + struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view; + struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view; const VkRect2D render_area = cmd_buffer->state.render_area; + struct anv_attachment_state *src_state = + &cmd_state->attachments[src_att]; + struct anv_attachment_state *dst_state = + &cmd_state->attachments[dst_att]; + if ((src_iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) { - struct anv_attachment_state *src_state = - &cmd_state->attachments[src_att]; - struct anv_attachment_state *dst_state = - &cmd_state->attachments[dst_att]; - /* MSAA resolves sample from the source attachment. Transition the * depth attachment first to get rid of any HiZ that we may not be * able to handle. 
*/ transition_depth_buffer(cmd_buffer, src_iview->image, + src_iview->planes[0].isl.base_array_layer, + fb->layers, src_state->current_layout, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); src_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - src_state->current_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; /* MSAA resolves write to the resolve attachment as if it were any * other transfer op. Transition the resolve attachment accordingly. @@ -4038,11 +5509,14 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) dst_initial_layout = VK_IMAGE_LAYOUT_UNDEFINED; transition_depth_buffer(cmd_buffer, dst_iview->image, + dst_iview->planes[0].isl.base_array_layer, + fb->layers, dst_initial_layout, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); dst_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; @@ -4067,6 +5541,9 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) if ((src_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) { + src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE; enum isl_aux_usage dst_aux_usage = ISL_AUX_USAGE_NONE; @@ -4089,6 +5566,48 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) } } +#if GEN_GEN == 7 + /* On gen7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in following + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. 
+ */ + if (subpass->depth_stencil_attachment) { + uint32_t a = subpass->depth_stencil_attachment->attachment; + assert(a != VK_ATTACHMENT_UNUSED); + + struct anv_attachment_state *att_state = &cmd_state->attachments[a]; + struct anv_image_view *iview = cmd_state->attachments[a].image_view;; + const struct anv_image *image = iview->image; + + if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + uint32_t plane = anv_image_aspect_to_plane(image->aspects, + VK_IMAGE_ASPECT_STENCIL_BIT); + + if (image->planes[plane].shadow_surface.isl.size_B > 0 && + att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) { + assert(image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT); + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + iview->planes[plane].isl.base_level, 1, + iview->planes[plane].isl.base_array_layer, + fb->layers); + } + } + } +#endif /* GEN_GEN == 7 */ + for (uint32_t i = 0; i < subpass->attachment_count; ++i) { const uint32_t a = subpass->attachments[i].attachment; if (a == VK_ATTACHMENT_UNUSED) @@ -4099,83 +5618,46 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) assert(a < cmd_state->pass->attachment_count); struct anv_attachment_state *att_state = &cmd_state->attachments[a]; - struct anv_image_view *iview = fb->attachments[a]; + struct anv_image_view *iview = cmd_state->attachments[a].image_view; const struct anv_image *image = iview->image; - if ((image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && - image->vk_format != iview->vk_format) { - enum anv_fast_clear_type fast_clear_type = - anv_layout_to_fast_clear_type(&cmd_buffer->device->info, - image, VK_IMAGE_ASPECT_COLOR_BIT, - att_state->current_layout); - - /* If any clear color was used, flush it down the aux surfaces. If we - * don't do it now using the view's format we might use the clear - * color incorrectly in the following resolves (for example with an - * SRGB view & a UNORM image). 
- */ - if (fast_clear_type != ANV_FAST_CLEAR_NONE) { - anv_perf_warn(cmd_buffer->device->instance, fb, - "Doing a partial resolve to get rid of clear color at the " - "end of a renderpass due to an image/view format mismatch"); - - uint32_t base_layer, layer_count; - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(iview->image->extent.depth, - iview->planes[0].isl.base_level); - } else { - base_layer = iview->planes[0].isl.base_array_layer; - layer_count = fb->layers; - } - - for (uint32_t a = 0; a < layer_count; a++) { - uint32_t array_layer = base_layer + a; - if (image->samples == 1) { - anv_cmd_predicated_ccs_resolve(cmd_buffer, image, - iview->planes[0].isl.format, - VK_IMAGE_ASPECT_COLOR_BIT, - iview->planes[0].isl.base_level, - array_layer, - ISL_AUX_OP_PARTIAL_RESOLVE, - ANV_FAST_CLEAR_NONE); - } else { - anv_cmd_predicated_mcs_resolve(cmd_buffer, image, - iview->planes[0].isl.format, - VK_IMAGE_ASPECT_COLOR_BIT, - base_layer, - ISL_AUX_OP_PARTIAL_RESOLVE, - ANV_FAST_CLEAR_NONE); - } - } - } - } - /* Transition the image into the final layout for this render pass */ VkImageLayout target_layout = cmd_state->pass->attachments[a].final_layout; + VkImageLayout target_stencil_layout = + cmd_state->pass->attachments[a].stencil_final_layout; + + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(iview->image->extent.depth, + iview->planes[0].isl.base_level); + } else { + base_layer = iview->planes[0].isl.base_array_layer; + layer_count = fb->layers; + } if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); - - uint32_t base_layer, layer_count; - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(iview->image->extent.depth, - iview->planes[0].isl.base_level); - } else { - base_layer = iview->planes[0].isl.base_array_layer; - layer_count = fb->layers; - } - transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, iview->planes[0].isl.base_level, 1, base_layer, layer_count, att_state->current_layout, target_layout); - } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + } + + if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, + base_layer, layer_count, att_state->current_layout, target_layout); } + + if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + iview->planes[0].isl.base_level, 1, + base_layer, layer_count, + att_state->current_stencil_layout, + target_stencil_layout); + } } /* Accumulate any subpass flushes that need to happen after the subpass. 
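
The color, depth, and stencil transitions in the hunks above all derive their layer range the same way: a 3D attachment covers every slice of the selected mip level, while an array attachment covers the view's base array layer for the framebuffer's layer count. The standalone sketch below models that selection. It is not driver code: the toy_* names and the simplified struct are illustrative stand-ins, and toy_minify only mirrors the shift-and-clamp-to-one behaviour of the anv_minify() expression used in the patch.

/* Hedged sketch: models the base_layer/layer_count selection shared by the
 * transition calls in the hunks above.  Simplified stand-in types only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_view {
   bool     is_3d;            /* VK_IMAGE_TYPE_3D in the real driver   */
   uint32_t depth;            /* extent.depth at mip level 0           */
   uint32_t base_level;       /* view's base mip level                 */
   uint32_t base_array_layer; /* view's base array layer               */
};

/* Shift by the level and clamp to >= 1 (0 stays 0), like anv_minify(). */
static uint32_t
toy_minify(uint32_t n, uint32_t level)
{
   return n == 0 ? 0 : ((n >> level) ? (n >> level) : 1);
}

/* 3D images: transition every slice of the selected mip level.
 * Array images: transition the view's layers for the framebuffer's layers.
 */
static void
toy_layer_range(const struct toy_view *view, uint32_t fb_layers,
                uint32_t *base_layer, uint32_t *layer_count)
{
   if (view->is_3d) {
      *base_layer = 0;
      *layer_count = toy_minify(view->depth, view->base_level);
   } else {
      *base_layer = view->base_array_layer;
      *layer_count = fb_layers;
   }
}

int
main(void)
{
   struct toy_view v = { .is_3d = true, .depth = 32, .base_level = 2 };
   uint32_t base, count;
   toy_layer_range(&v, 1, &base, &count);
   printf("3D view, level 2: base=%u count=%u\n", base, count); /* 0, 8 */
   return 0;
}

For a 32-deep 3D image viewed at level 2 the sketch reports base=0 count=8, matching the anv_minify(iview->image->extent.depth, iview->planes[0].isl.base_level) expression the transitions above rely on.
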
@@ -4195,14 +5677,15 @@ void genX(CmdBeginRenderPass)( ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass); ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer); + VkResult result; cmd_buffer->state.framebuffer = framebuffer; cmd_buffer->state.pass = pass; cmd_buffer->state.render_area = pRenderPassBegin->renderArea; - VkResult result = - genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, pRenderPassBegin); - /* If we failed to setup the attachments we should not try to go further */ + result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, + framebuffer, + pRenderPassBegin); if (result != VK_SUCCESS) { assert(anv_batch_has_error(&cmd_buffer->batch)); return; @@ -4213,7 +5696,7 @@ void genX(CmdBeginRenderPass)( cmd_buffer_begin_subpass(cmd_buffer, 0); } -void genX(CmdBeginRenderPass2KHR)( +void genX(CmdBeginRenderPass2)( VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBeginInfo, const VkSubpassBeginInfoKHR* pSubpassBeginInfo) @@ -4238,7 +5721,7 @@ void genX(CmdNextSubpass)( cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); } -void genX(CmdNextSubpass2KHR)( +void genX(CmdNextSubpass2)( VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo) @@ -4259,7 +5742,7 @@ void genX(CmdEndRenderPass)( cmd_buffer->state.hiz_enabled = false; #ifndef NDEBUG - anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer); + anv_dump_add_attachments(cmd_buffer); #endif /* Remove references to render pass specific state. This enables us to @@ -4270,9 +5753,248 @@ void genX(CmdEndRenderPass)( cmd_buffer->state.subpass = NULL; } -void genX(CmdEndRenderPass2KHR)( +void genX(CmdEndRenderPass2)( VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo) { genX(CmdEndRenderPass)(commandBuffer); } + +void +genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer) +{ +#if GEN_GEN >= 8 || GEN_IS_HASWELL + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), + gen_mi_reg32(ANV_PREDICATE_RESULT_REG)); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +#endif +} + +#if GEN_GEN >= 8 || GEN_IS_HASWELL +void genX(CmdBeginConditionalRenderingEXT)( + VkCommandBuffer commandBuffer, + const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_address value_address = + anv_address_add(buffer->address, pConditionalRenderingBegin->offset); + + const bool isInverted = pConditionalRenderingBegin->flags & + VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; + + cmd_state->conditional_render_enabled = true; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + /* Section 19.4 of the Vulkan 1.1.85 spec says: + * + * If the value of the predicate in buffer memory changes + * while conditional rendering is active, the rendering commands + * may be discarded in an implementation-dependent way. 
+ * Some implementations may latch the value of the predicate + * upon beginning conditional rendering while others + * may read it before every rendering command. + * + * So it's perfectly fine to read a value from the buffer once. + */ + struct gen_mi_value value = gen_mi_mem32(value_address); + + /* Precompute predicate result, it is necessary to support secondary + * command buffers since it is unknown if conditional rendering is + * inverted when populating them. + */ + gen_mi_store(&b, gen_mi_reg64(ANV_PREDICATE_RESULT_REG), + isInverted ? gen_mi_uge(&b, gen_mi_imm(0), value) : + gen_mi_ult(&b, gen_mi_imm(0), value)); +} + +void genX(CmdEndConditionalRenderingEXT)( + VkCommandBuffer commandBuffer) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + + cmd_state->conditional_render_enabled = false; +} +#endif + +/* Set of stage bits for which are pipelined, i.e. they get queued by the + * command streamer for later execution. + */ +#define ANV_PIPELINE_STAGE_PIPELINED_BITS \ + (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | \ + VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | \ + VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | \ + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | \ + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | \ + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | \ + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | \ + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | \ + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | \ + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | \ + VK_PIPELINE_STAGE_TRANSFER_BIT | \ + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | \ + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | \ + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT) + +void genX(CmdSetEvent)( + VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags stageMask) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_event, event, _event); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } + + pc.DestinationAddressType = DAT_PPGTT, + pc.PostSyncOperation = WriteImmediateData, + pc.Address = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + pc.ImmediateData = VK_EVENT_SET; + } +} + +void genX(CmdResetEvent)( + VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags stageMask) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_event, event, _event); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } + + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + pc.ImmediateData = VK_EVENT_RESET; + } +} + +void genX(CmdWaitEvents)( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags destStageMask, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + 
uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ +#if GEN_GEN >= 8 + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + for (uint32_t i = 0; i < eventCount; i++) { + ANV_FROM_HANDLE(anv_event, event, pEvents[i]); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) { + sem.WaitMode = PollingMode, + sem.CompareOperation = COMPARE_SAD_EQUAL_SDD, + sem.SemaphoreDataDword = VK_EVENT_SET, + sem.SemaphoreAddress = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + } + } +#else + anv_finishme("Implement events on gen7"); +#endif + + genX(CmdPipelineBarrier)(commandBuffer, srcStageMask, destStageMask, + false, /* byRegion */ + memoryBarrierCount, pMemoryBarriers, + bufferMemoryBarrierCount, pBufferMemoryBarriers, + imageMemoryBarrierCount, pImageMemoryBarriers); +} + +VkResult genX(CmdSetPerformanceOverrideINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceOverrideInfoINTEL* pOverrideInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + switch (pOverrideInfo->type) { + case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: { + uint32_t dw; + +#if GEN_GEN >= 9 + anv_pack_struct(&dw, GENX(CS_DEBUG_MODE2), + ._3DRenderingInstructionDisable = pOverrideInfo->enable, + .MediaInstructionDisable = pOverrideInfo->enable, + ._3DRenderingInstructionDisableMask = true, + .MediaInstructionDisableMask = true); + emit_lri(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2_num), dw); +#else + anv_pack_struct(&dw, GENX(INSTPM), + ._3DRenderingInstructionDisable = pOverrideInfo->enable, + .MediaInstructionDisable = pOverrideInfo->enable, + ._3DRenderingInstructionDisableMask = true, + .MediaInstructionDisableMask = true); + emit_lri(&cmd_buffer->batch, GENX(INSTPM_num), dw); +#endif + break; + } + + case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL: + if (pOverrideInfo->enable) { + /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_FLUSH_BITS | + ANV_PIPE_INVALIDATE_BITS; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + break; + + default: + unreachable("Invalid override"); + } + + return VK_SUCCESS; +} + +VkResult genX(CmdSetPerformanceStreamMarkerINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo) +{ + /* TODO: Waiting on the register to write, might depend on generation. */ + + return VK_SUCCESS; +}
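
As a companion to the conditional-rendering hunks above, the sketch below models, on the host, the two stages of the predicate: the value latched into ANV_PREDICATE_RESULT_REG by CmdBeginConditionalRenderingEXT (a gen_mi_uge/gen_mi_ult comparison against an immediate zero), and the MI_PREDICATE(LOADINV, SET, SRCS_EQUAL) check emitted per predicated draw. This is illustrative only, assuming the builder's first operand is the left-hand side of the comparison; the real register holds the MI_MATH comparison result, and the model relies only on it being zero or non-zero.

/* Hedged host-side model of the conditional-rendering predicate.  The
 * toy_* helpers are invented for illustration and are not driver code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stage 1: value latched at CmdBeginConditionalRenderingEXT time.
 * Non-inverted begin: render when the 32-bit buffer value is non-zero
 * (0 < value).  Inverted begin: render when it is zero (0 >= value).
 */
static uint64_t
toy_predicate_result(uint32_t buffer_value, bool inverted)
{
   return inverted ? (0u >= buffer_value)
                   : (0u <  buffer_value);
}

/* Stage 2: the MI_PREDICATE(LOADINV, SET, SRCS_EQUAL) check with
 * SRC0 = result, SRC1 = 0: compare against zero, then invert, so the
 * predicated draw runs exactly when the latched result is non-zero.
 */
static bool
toy_draw_executes(uint64_t predicate_result)
{
   bool srcs_equal = (predicate_result == 0);
   return !srcs_equal; /* LOADINV */
}

int
main(void)
{
   printf("value=0, normal   -> draw=%d\n",
          toy_draw_executes(toy_predicate_result(0, false))); /* 0 */
   printf("value=7, normal   -> draw=%d\n",
          toy_draw_executes(toy_predicate_result(7, false))); /* 1 */
   printf("value=0, inverted -> draw=%d\n",
          toy_draw_executes(toy_predicate_result(0, true)));  /* 1 */
   printf("value=7, inverted -> draw=%d\n",
          toy_draw_executes(toy_predicate_result(7, true)));  /* 0 */
   return 0;
}

Running the model shows that a non-inverted begin executes draws exactly when the 32-bit value in the buffer is non-zero, and VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT flips that, which is the single-read, latch-at-begin behaviour the Vulkan 1.1.85 section 19.4 quote in the hunk above explicitly permits.
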