X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2FgenX_cmd_buffer.c;h=74a052d4807b5796aa244d3fa53837720e3816c1;hb=dab7a4d82c1a490c93da3c19d34097732ae8a15c;hp=ec72010cb860c17fee515afe03ef511065de3fb7;hpb=48da45891e3844d523586b95164419ee70e653b4;p=mesa.git diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index ec72010cb86..74a052d4807 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -29,6 +29,7 @@ #include "vk_util.h" #include "util/fast_idiv_by_const.h" +#include "common/gen_aux_map.h" #include "common/gen_l3_config.h" #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" @@ -39,6 +40,9 @@ #define __gen_address_offset anv_address_add #include "common/gen_mi_builder.h" +static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, + uint32_t pipeline); + static void emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm) { @@ -52,6 +56,8 @@ void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) { struct anv_device *device = cmd_buffer->device; + UNUSED const struct gen_device_info *devinfo = &device->info; + uint32_t mocs = device->isl_dev.mocs.internal; /* If we are emitting a new state base address we probably need to re-emit * binding tables. @@ -69,30 +75,56 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) pc.DCFlushEnable = true; pc.RenderTargetCacheFlushEnable = true; pc.CommandStreamerStallEnable = true; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; +#endif +#if GEN_GEN == 12 + /* GEN:BUG:1606662791: + * + * Software must program PIPE_CONTROL command with "HDC Pipeline + * Flush" prior to programming of the below two non-pipeline state : + * * STATE_BASE_ADDRESS + * * 3DSTATE_BINDING_TABLE_POOL_ALLOC + */ + if (devinfo->revision == 0 /* A0 */) + pc.HDCPipelineFlushEnable = true; +#endif } +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline + * mode by putting the pipeline temporarily in 3D mode. + */ + uint32_t gen12_wa_pipeline = cmd_buffer->state.current_pipeline; + genX(flush_pipeline_select_3d)(cmd_buffer); +#endif + anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) { sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.GeneralStateMOCS = GENX(MOCS); + sba.GeneralStateMOCS = mocs; sba.GeneralStateBaseAddressModifyEnable = true; + sba.StatelessDataPortAccessMOCS = mocs; + sba.SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer); - sba.SurfaceStateMOCS = GENX(MOCS); + sba.SurfaceStateMOCS = mocs; sba.SurfaceStateBaseAddressModifyEnable = true; sba.DynamicStateBaseAddress = (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 }; - sba.DynamicStateMOCS = GENX(MOCS); + sba.DynamicStateMOCS = mocs; sba.DynamicStateBaseAddressModifyEnable = true; sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 }; - sba.IndirectObjectMOCS = GENX(MOCS); + sba.IndirectObjectMOCS = mocs; sba.IndirectObjectBaseAddressModifyEnable = true; sba.InstructionBaseAddress = (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 }; - sba.InstructionMOCS = GENX(MOCS); + sba.InstructionMOCS = mocs; sba.InstructionBaseAddressModifyEnable = true; # if (GEN_GEN >= 8) @@ -100,29 +132,71 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) * these fields. 
However, since we will be growing the BO's live, we * just set them all to the maximum. */ - sba.GeneralStateBufferSize = 0xfffff; + sba.GeneralStateBufferSize = 0xfffff; + sba.IndirectObjectBufferSize = 0xfffff; + if (device->physical->use_softpin) { + /* With softpin, we use fixed addresses so we actually know how big + * our base addresses are. + */ + sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096; + sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096; + } else { + sba.DynamicStateBufferSize = 0xfffff; + sba.InstructionBufferSize = 0xfffff; + } sba.GeneralStateBufferSizeModifyEnable = true; - sba.DynamicStateBufferSize = 0xfffff; - sba.DynamicStateBufferSizeModifyEnable = true; - sba.IndirectObjectBufferSize = 0xfffff; sba.IndirectObjectBufferSizeModifyEnable = true; - sba.InstructionBufferSize = 0xfffff; + sba.DynamicStateBufferSizeModifyEnable = true; sba.InstructionBuffersizeModifyEnable = true; +# else + /* On gen7, we have upper bounds instead. According to the docs, + * setting an upper bound of zero means that no bounds checking is + * performed so, in theory, we should be able to leave them zero. + * However, border color is broken and the GPU bounds-checks anyway. + * To avoid this and other potential problems, we may as well set it + * for everything. + */ + sba.GeneralStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.GeneralStateAccessUpperBoundModifyEnable = true; + sba.DynamicStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.DynamicStateAccessUpperBoundModifyEnable = true; + sba.InstructionAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.InstructionAccessUpperBoundModifyEnable = true; # endif # if (GEN_GEN >= 9) - sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.BindlessSurfaceStateMOCS = GENX(MOCS); + if (cmd_buffer->device->physical->use_softpin) { + sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { + .bo = device->surface_state_pool.block_pool.bo, + .offset = 0, + }; + sba.BindlessSurfaceStateSize = (1 << 20) - 1; + } else { + sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS; + sba.BindlessSurfaceStateSize = 0; + } + sba.BindlessSurfaceStateMOCS = mocs; sba.BindlessSurfaceStateBaseAddressModifyEnable = true; - sba.BindlessSurfaceStateSize = 0; # endif # if (GEN_GEN >= 10) sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.BindlessSamplerStateMOCS = GENX(MOCS); + sba.BindlessSamplerStateMOCS = mocs; sba.BindlessSamplerStateBaseAddressModifyEnable = true; sba.BindlessSamplerStateBufferSize = 0; # endif } +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Put the pipeline back into its current mode. + */ + if (gen12_wa_pipeline != UINT32_MAX) + genX(flush_pipeline_select)(cmd_buffer, gen12_wa_pipeline); +#endif + /* After re-setting the surface state base address, we have to do some * cache flusing so that the sampler engine will pick up the new * SURFACE_STATE objects and binding tables. 
From the Broadwell PRM, @@ -176,7 +250,7 @@ add_surface_reloc(struct anv_cmd_buffer *cmd_buffer, VkResult result = anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, state.offset + isl_dev->ss.addr_offset, - addr.bo, addr.offset); + addr.bo, addr.offset, NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -195,7 +269,9 @@ add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer, anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, state.state.offset + isl_dev->ss.aux_addr_offset, - state.aux_address.bo, state.aux_address.offset); + state.aux_address.bo, + state.aux_address.offset, + NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -206,7 +282,9 @@ add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer, &cmd_buffer->pool->alloc, state.state.offset + isl_dev->ss.clear_color_state_offset, - state.clear_address.bo, state.clear_address.offset); + state.clear_address.bo, + state.clear_address.offset, + NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -219,7 +297,7 @@ color_attachment_compute_aux_usage(struct anv_device * device, union isl_color_value *fast_clear_color) { struct anv_attachment_state *att_state = &cmd_state->attachments[att]; - struct anv_image_view *iview = cmd_state->framebuffer->attachments[att]; + struct anv_image_view *iview = cmd_state->attachments[att].image_view; assert(iview->n_planes == 1); @@ -238,6 +316,7 @@ color_attachment_compute_aux_usage(struct anv_device * device, att_state->aux_usage = anv_layout_to_aux_usage(&device->info, iview->image, VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); /* If we don't have aux, then we should have returned early in the layer @@ -260,7 +339,8 @@ color_attachment_compute_aux_usage(struct anv_device * device, * In other words, we can only sample from a fast-cleared image if it * also supports color compression. */ - if (isl_format_supports_ccs_e(&device->info, iview->planes[0].isl.format)) { + if (isl_format_supports_ccs_e(&device->info, iview->planes[0].isl.format) && + isl_format_supports_ccs_d(&device->info, iview->planes[0].isl.format)) { att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D; /* While fast-clear resolves and partial resolves are fairly cheap in the @@ -271,7 +351,7 @@ color_attachment_compute_aux_usage(struct anv_device * device, */ if (cmd_state->pass->attachments[att].first_subpass_layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { - anv_perf_warn(device->instance, iview->image, + anv_perf_warn(device, iview->image, "Not temporarily enabling CCS_E."); } } else { @@ -335,13 +415,13 @@ color_attachment_compute_aux_usage(struct anv_device * device, if (att_state->fast_clear && (iview->planes[0].isl.base_level > 0 || iview->planes[0].isl.base_array_layer > 0)) { - anv_perf_warn(device->instance, iview->image, + anv_perf_warn(device, iview->image, "Rendering with multi-lod or multi-layer framebuffer " "with LOAD_OP_LOAD and baseMipLevel > 0 or " "baseArrayLayer > 0. Not fast clearing."); att_state->fast_clear = false; } else if (att_state->fast_clear && cmd_state->framebuffer->layers > 1) { - anv_perf_warn(device->instance, iview->image, + anv_perf_warn(device, iview->image, "Rendering to a multi-layer framebuffer with " "LOAD_OP_CLEAR. 
Only fast-clearing the first slice"); } @@ -361,12 +441,17 @@ depth_stencil_attachment_compute_aux_usage(struct anv_device *device, struct anv_render_pass_attachment *pass_att = &cmd_state->pass->attachments[att]; struct anv_attachment_state *att_state = &cmd_state->attachments[att]; - struct anv_image_view *iview = cmd_state->framebuffer->attachments[att]; + struct anv_image_view *iview = cmd_state->attachments[att].image_view; /* These will be initialized after the first subpass transition. */ att_state->aux_usage = ISL_AUX_USAGE_NONE; att_state->input_aux_usage = ISL_AUX_USAGE_NONE; + /* This is unused for depth/stencil but valgrind complains if it + * isn't initialized + */ + att_state->clear_color_is_zero_one = false; + if (GEN_GEN == 7) { /* We don't do any HiZ or depth fast-clears on gen7 yet */ att_state->fast_clear = false; @@ -389,13 +474,13 @@ depth_stencil_attachment_compute_aux_usage(struct anv_device *device, const enum isl_aux_usage first_subpass_aux_usage = anv_layout_to_aux_usage(&device->info, iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, pass_att->first_subpass_layout); - if (first_subpass_aux_usage != ISL_AUX_USAGE_HIZ) - return; - - if (!blorp_can_hiz_clear_depth(GEN_GEN, - iview->planes[0].isl.format, - iview->image->samples, + if (!blorp_can_hiz_clear_depth(&device->info, + &iview->image->planes[0].surface.isl, + first_subpass_aux_usage, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, render_area.offset.x, render_area.offset.y, render_area.offset.x + @@ -443,27 +528,95 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, VkImageLayout initial_layout, VkImageLayout final_layout) { - const bool hiz_enabled = ISL_AUX_USAGE_HIZ == - anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, initial_layout); - const bool enable_hiz = ISL_AUX_USAGE_HIZ == - anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, final_layout); - - enum isl_aux_op hiz_op; - if (hiz_enabled && !enable_hiz) { - hiz_op = ISL_AUX_OP_FULL_RESOLVE; - } else if (!hiz_enabled && enable_hiz) { - hiz_op = ISL_AUX_OP_AMBIGUATE; - } else { - assert(hiz_enabled == enable_hiz); - /* If the same buffer will be used, no resolves are necessary. */ - hiz_op = ISL_AUX_OP_NONE; - } + uint32_t depth_plane = + anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT); + if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE) + return; + + const enum isl_aux_state initial_state = + anv_layout_to_aux_state(&cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + initial_layout); + const enum isl_aux_state final_state = + anv_layout_to_aux_state(&cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + final_layout); + + const bool initial_depth_valid = + isl_aux_state_has_valid_primary(initial_state); + const bool initial_hiz_valid = + isl_aux_state_has_valid_aux(initial_state); + const bool final_needs_depth = + isl_aux_state_has_valid_primary(final_state); + const bool final_needs_hiz = + isl_aux_state_has_valid_aux(final_state); + + /* Getting into the pass-through state for Depth is tricky and involves + * both a resolve and an ambiguate. We don't handle that state right now + * as anv_layout_to_aux_state never returns it. 
+ */ + assert(final_state != ISL_AUX_STATE_PASS_THROUGH); - if (hiz_op != ISL_AUX_OP_NONE) + if (final_needs_depth && !initial_depth_valid) { + assert(initial_hiz_valid); + anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, 0, 1, ISL_AUX_OP_FULL_RESOLVE); + } else if (final_needs_hiz && !initial_hiz_valid) { + assert(initial_depth_valid); anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - 0, 0, 1, hiz_op); + 0, 0, 1, ISL_AUX_OP_AMBIGUATE); + } +} + +static inline bool +vk_image_layout_stencil_write_optimal(VkImageLayout layout) +{ + return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR; +} + +/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless + * the initial layout is undefined, the HiZ buffer and depth buffer will + * represent the same data at the end of this operation. + */ +static void +transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout) +{ +#if GEN_GEN == 7 + uint32_t plane = anv_image_aspect_to_plane(image->aspects, + VK_IMAGE_ASPECT_STENCIL_BIT); + + /* On gen7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in following + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. + */ + if (image->planes[plane].shadow_surface.isl.size_B > 0 && + vk_image_layout_stencil_write_optimal(initial_layout) && + !vk_image_layout_stencil_write_optimal(final_layout)) { + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + base_level, level_count, + base_layer, layer_count); + } +#endif /* GEN_GEN == 7 */ } #define MI_PREDICATE_SRC0 0x2400 @@ -514,23 +667,6 @@ set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer, set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true); } -#if GEN_IS_HASWELL || GEN_GEN >= 8 -static inline uint32_t -mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2) -{ - struct GENX(MI_MATH_ALU_INSTRUCTION) instr = { - .ALUOpcode = opcode, - .Operand1 = operand1, - .Operand2 = operand2, - }; - - uint32_t dw; - GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr); - - return dw; -} -#endif - /* This is only really practical on haswell and above because it requires * MI math in order to get it correct. */ @@ -687,7 +823,7 @@ anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, * to do a partial resolve on a CCS_D surface. 
*/ if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && - image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) resolve_op = ISL_AUX_OP_FULL_RESOLVE; anv_image_ccs_op(cmd_buffer, image, format, aspect, level, @@ -754,27 +890,21 @@ init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer, set_image_fast_clear_state(cmd_buffer, image, aspect, ANV_FAST_CLEAR_NONE); - /* The fast clear value dword(s) will be copied into a surface state object. - * Ensure that the restrictions of the fields in the dword(s) are followed. - * - * CCS buffers on SKL+ can have any value set for the clear colors. - */ - if (image->samples == 1 && GEN_GEN >= 9) - return; - - /* Other combinations of auxiliary buffers and platforms require specific - * values in the clear value dword(s). + /* Initialize the struct fields that are accessed for fast-clears so that + * the HW restrictions on the field values are satisfied. */ struct anv_address addr = anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); if (GEN_GEN >= 9) { - for (unsigned i = 0; i < 4; i++) { + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + const unsigned num_dwords = GEN_GEN >= 10 ? + isl_dev->ss.clear_color_state_size / 4 : + isl_dev->ss.clear_value_size / 4; + for (unsigned i = 0; i < num_dwords; i++) { anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { sdi.Address = addr; sdi.Address.offset += i * 4; - /* MCS buffers on SKL+ can only have 1/0 clear colors. */ - assert(image->samples > 1); sdi.ImmediateData = 0; } } @@ -822,12 +952,35 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; +#if GEN_GEN == 7 + /* On gen7, the combination of commands used here(MI_LOAD_REGISTER_MEM + * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is + * in-flight when they are issued even if the memory touched is not + * currently active for rendering. The weird bit is that it is not the + * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight + * rendering hangs such that the next stalling command after the + * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang. + * + * It is unclear exactly why this hang occurs. Both MI commands come with + * warnings about the 3D pipeline but that doesn't seem to fully explain + * it. My (Jason's) best theory is that it has something to do with the + * fact that we're using a GPU state register as our temporary and that + * something with reading/writing it is causing problems. + * + * In order to work around this issue, we emit a PIPE_CONTROL with the + * command streamer stall bit set. + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +#endif + + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + if (copy_from_surface_state) { - genX(cmd_buffer_mi_memcpy)(cmd_buffer, entry_addr, - ss_clear_addr, copy_size); + gen_mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size); } else { - genX(cmd_buffer_mi_memcpy)(cmd_buffer, ss_clear_addr, - entry_addr, copy_size); + gen_mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size); /* Updating a surface state object may require that the state cache be * invalidated. 
From the SKL PRM, Shared Functions -> State -> State @@ -845,6 +998,105 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, } } +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + +#if GEN_GEN == 12 +static void +anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count) +{ + uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); + + uint64_t base_address = + anv_address_physical(image->planes[plane].address); + + const struct isl_surf *isl_surf = &image->planes[plane].surface.isl; + uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf); + + /* We're about to live-update the AUX-TT. We really don't want anyone else + * trying to read it while we're doing this. We could probably get away + * with not having this stall in some cases if we were really careful but + * it's better to play it safe. Full stall the GPU. + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + + uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0; + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + uint32_t logical_array_layer, logical_z_offset_px; + if (image->type == VK_IMAGE_TYPE_3D) { + logical_array_layer = 0; + + /* If the given miplevel does not have this layer, then any higher + * miplevels won't either because miplevels only get smaller the + * higher the LOD. + */ + assert(layer < image->extent.depth); + if (layer >= anv_minify(image->extent.depth, level)) + break; + logical_z_offset_px = layer; + } else { + assert(layer < image->array_size); + logical_array_layer = layer; + logical_z_offset_px = 0; + } + + uint32_t slice_start_offset_B, slice_end_offset_B; + isl_surf_get_image_range_B_tile(isl_surf, level, + logical_array_layer, + logical_z_offset_px, + &slice_start_offset_B, + &slice_end_offset_B); + + start_offset_B = MIN2(start_offset_B, slice_start_offset_B); + end_offset_B = MAX2(end_offset_B, slice_end_offset_B); + } + + /* Aux operates 64K at a time */ + start_offset_B = align_down_u64(start_offset_B, 64 * 1024); + end_offset_B = align_u64(end_offset_B, 64 * 1024); + + for (uint64_t offset = start_offset_B; + offset < end_offset_B; offset += 64 * 1024) { + uint64_t address = base_address + offset; + + uint64_t aux_entry_addr64, *aux_entry_map; + aux_entry_map = gen_aux_map_get_entry(cmd_buffer->device->aux_map_ctx, + address, &aux_entry_addr64); + + assert(cmd_buffer->device->physical->use_softpin); + struct anv_address aux_entry_address = { + .bo = NULL, + .offset = aux_entry_addr64, + }; + + const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map); + uint64_t new_aux_entry = + (old_aux_entry & GEN_AUX_MAP_ADDRESS_MASK) | format_bits; + + if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) + new_aux_entry |= GEN_AUX_MAP_ENTRY_VALID_BIT; + + gen_mi_store(&b, gen_mi_mem64(aux_entry_address), + gen_mi_imm(new_aux_entry)); + } + } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT; +} +#endif /* GEN_GEN == 12 */ + /** * @brief Transitions a color buffer from one layout to another. 
* @@ -865,7 +1117,8 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, VkImageLayout initial_layout, VkImageLayout final_layout) { - const struct gen_device_info *devinfo = &cmd_buffer->device->info; + struct anv_device *device = cmd_buffer->device; + const struct gen_device_info *devinfo = &device->info; /* Validate the inputs. */ assert(cmd_buffer); assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); @@ -902,6 +1155,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, assert(isl_format_is_compressed(image->planes[plane].surface.isl.format)); assert(plane == 0); anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_COLOR_BIT, base_level, level_count, base_layer, layer_count); } @@ -909,10 +1163,20 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, if (base_layer >= anv_image_aux_layers(image, aspect, base_level)) return; - assert(image->tiling == VK_IMAGE_TILING_OPTIMAL); + assert(image->planes[plane].surface.isl.tiling != ISL_TILING_LINEAR); if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) { +#if GEN_GEN == 12 + if (device->physical->has_implicit_ccs && devinfo->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, aspect, + base_level, level_count, + base_layer, layer_count); + } +#else + assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map)); +#endif + /* A subresource in the undefined layout may have been aliased and * populated with any arrangement of bits. Therefore, we must initialize * the related aux buffer and clear buffer entry with desirable values. @@ -980,7 +1244,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, } } else { if (image->samples == 4 || image->samples == 16) { - anv_perf_warn(cmd_buffer->device->instance, image, + anv_perf_warn(cmd_buffer->device, image, "Doing a potentially unnecessary fast-clear to " "define an MCS buffer."); } @@ -995,9 +1259,9 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, } const enum isl_aux_usage initial_aux_usage = - anv_layout_to_aux_usage(devinfo, image, aspect, initial_layout); + anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout); const enum isl_aux_usage final_aux_usage = - anv_layout_to_aux_usage(devinfo, image, aspect, final_layout); + anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout); /* The current code assumes that there is no mixing of CCS_E and CCS_D. * We can handle transitions between CCS_D/E to and from NONE. 
What we @@ -1100,6 +1364,7 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, { const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; struct anv_cmd_state *state = &cmd_buffer->state; + struct anv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; vk_free(&cmd_buffer->pool->alloc, state->attachments); @@ -1139,6 +1404,12 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, next_state.offset += ss_stride; next_state.map += ss_stride; + const VkRenderPassAttachmentBeginInfoKHR *begin_attachment = + vk_find_struct_const(begin, RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR); + + if (begin && !begin_attachment) + assert(pass->attachment_count == framebuffer->attachment_count); + for (uint32_t i = 0; i < pass->attachment_count; ++i) { if (vk_format_is_color(pass->attachments[i].format)) { state->attachments[i].color.state = next_state; @@ -1151,14 +1422,19 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, next_state.offset += ss_stride; next_state.map += ss_stride; } + + if (begin_attachment && begin_attachment->attachmentCount != 0) { + assert(begin_attachment->attachmentCount == pass->attachment_count); + ANV_FROM_HANDLE(anv_image_view, iview, begin_attachment->pAttachments[i]); + cmd_buffer->state.attachments[i].image_view = iview; + } else if (framebuffer && i < framebuffer->attachment_count) { + cmd_buffer->state.attachments[i].image_view = framebuffer->attachments[i]; + } } assert(next_state.offset == state->render_pass_states.offset + state->render_pass_states.alloc_size); if (begin) { - ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer); - assert(pass->attachment_count == framebuffer->attachment_count); - isl_null_fill_state(isl_dev, state->null_surface_state.map, isl_extent3d(framebuffer->width, framebuffer->height, @@ -1196,12 +1472,13 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, } state->attachments[i].current_layout = att->initial_layout; + state->attachments[i].current_stencil_layout = att->stencil_initial_layout; state->attachments[i].pending_clear_aspects = clear_aspects; state->attachments[i].pending_load_aspects = load_aspects; if (clear_aspects) state->attachments[i].clear_value = begin->pClearValues[i]; - struct anv_image_view *iview = framebuffer->attachments[i]; + struct anv_image_view *iview = cmd_buffer->state.attachments[i].image_view; anv_assert(iview->vk_format == att->format); const uint32_t num_layers = iview->planes[0].isl.array_len; @@ -1291,9 +1568,18 @@ genX(BeginCommandBuffer)( * executing anything. The chances are fairly high that they will use * blorp at least once per primary command buffer so it shouldn't be * wasted. + * + * There is also a workaround on gen8 which requires us to invalidate the + * VF cache occasionally. It's easier if we can assume we start with a + * fresh cache (See also genX(cmd_buffer_set_binding_for_gen8_vb_flush).) */ - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + + /* Re-emit the aux table register in every command buffer. This way we're + * ensured that we have the table even if this command buffer doesn't + * initialize any images. 
+ */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT; /* We send an "Indirect State Pointers Disable" packet at * EndCommandBuffer, so all push contant packets are ignored during a @@ -1330,7 +1616,9 @@ genX(BeginCommandBuffer)( enum isl_aux_usage aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image, - VK_IMAGE_ASPECT_DEPTH_BIT, layout); + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + layout); cmd_buffer->state.hiz_enabled = aux_usage == ISL_AUX_USAGE_HIZ; } @@ -1498,6 +1786,14 @@ genX(CmdExecuteCommands)( anv_cmd_buffer_add_secondary(primary, secondary); } + /* The secondary isn't counted in our VF cache tracking so we need to + * invalidate the whole thing. + */ + if (GEN_GEN >= 8 && GEN_GEN <= 9) { + primary->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } + /* The secondary may have selected a different pipeline (3D or compute) and * may have changed the current L3$ configuration. Reset our tracking * variables to invalid values to ensure that we re-emit these in the case @@ -1506,6 +1802,7 @@ genX(CmdExecuteCommands)( */ primary->state.current_pipeline = UINT32_MAX; primary->state.current_l3_config = NULL; + primary->state.current_hash_scale = 0; /* Each of the secondary command buffers will use its own state base * address. We need to re-emit state base address for the primary after @@ -1537,7 +1834,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, gen_dump_l3_config(cfg, stderr); } - const bool has_slm = cfg->n[GEN_L3P_SLM]; + UNUSED const bool has_slm = cfg->n[GEN_L3P_SLM]; /* According to the hardware docs, the L3 partitioning can only be changed * while the pipeline is completely drained and the caches are flushed, @@ -1584,9 +1881,19 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]); +#if GEN_GEN >= 12 +#define L3_ALLOCATION_REG GENX(L3ALLOC) +#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num) +#else +#define L3_ALLOCATION_REG GENX(L3CNTLREG) +#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num) +#endif + uint32_t l3cr; - anv_pack_struct(&l3cr, GENX(L3CNTLREG), + anv_pack_struct(&l3cr, L3_ALLOCATION_REG, +#if GEN_GEN < 11 .SLMEnable = has_slm, +#endif #if GEN_GEN == 11 /* WA_1406697149: Bit 9 "Error Detection Behavior Control" must be set * in L3CNTLREG register. The default setting of the bit is not the @@ -1601,7 +1908,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, .AllAllocation = cfg->n[GEN_L3P_ALL]); /* Set up the L3 partitioning. */ - emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr); + emit_lri(&cmd_buffer->batch, L3_ALLOCATION_REG_num, l3cr); #else @@ -1625,7 +1932,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]); /* Minimum number of ways that can be allocated to the URB. */ - MAYBE_UNUSED const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0; + const unsigned n0_urb = devinfo->is_baytrail ? 
32 : 0; assert(cfg->n[GEN_L3P_URB] >= n0_urb); uint32_t l3sqcr1, l3cr2, l3cr3; @@ -1663,7 +1970,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3); #if GEN_IS_HASWELL - if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) { + if (cmd_buffer->device->physical->cmd_parser_version >= 4) { /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep * them disabled to avoid crashing the system hard. */ @@ -1686,8 +1993,12 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) { + UNUSED const struct gen_device_info *devinfo = &cmd_buffer->device->info; enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; + if (cmd_buffer->device->physical->always_flush_cache) + bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; + /* Flushes are pipelined while invalidations are handled immediately. * Therefore, if we're flushing anything then we need to schedule a stall * before any invalidations can happen. @@ -1704,14 +2015,80 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT; } + if (GEN_GEN >= 12 && + ((bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT) || + (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT))) { + /* From the PIPE_CONTROL instruction table, bit 28 (Tile Cache Flush + * Enable): + * + * Unified Cache (Tile Cache Disabled): + * + * When the Color and Depth (Z) streams are enabled to be cached in + * the DC space of L2, Software must use "Render Target Cache Flush + * Enable" and "Depth Cache Flush Enable" along with "Tile Cache + * Flush" for getting the color and depth (Z) write data to be + * globally observable. In this mode of operation it is not required + * to set "CS Stall" upon setting "Tile Cache Flush" bit. + */ + bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; + } + + /* GEN:BUG:1409226450, Wait for EU to be idle before pipe control which + * invalidates the instruction cache + */ + if (GEN_GEN == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT)) + bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + + if ((GEN_GEN >= 8 && GEN_GEN <= 9) && + (bits & ANV_PIPE_CS_STALL_BIT) && + (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) { + /* If we are doing a VF cache invalidate AND a CS stall (it must be + * both) then we can reset our vertex cache tracking. + */ + memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0, + sizeof(cmd_buffer->state.gfx.vb_dirty_ranges)); + memset(&cmd_buffer->state.gfx.ib_dirty_range, 0, + sizeof(cmd_buffer->state.gfx.ib_dirty_range)); + } + + /* Project: SKL / Argument: LRI Post Sync Operation [23] + * + * "PIPECONTROL command with “Command Streamer Stall Enable” must be + * programmed prior to programming a PIPECONTROL command with "LRI + * Post Sync Operation" in GPGPU mode of operation (i.e when + * PIPELINE_SELECT command is set to GPGPU mode of operation)." + * + * The same text exists a few rows below for Post Sync Op. + * + * On Gen12 this is GEN:BUG:1607156449. 
+ */ + if (bits & ANV_PIPE_POST_SYNC_BIT) { + if ((GEN_GEN == 9 || (GEN_GEN == 12 && devinfo->revision == 0 /* A0 */)) && + cmd_buffer->state.current_pipeline == GPGPU) + bits |= ANV_PIPE_CS_STALL_BIT; + bits &= ~ANV_PIPE_POST_SYNC_BIT; + } + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { +#if GEN_GEN >= 12 + pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT; +#endif pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT; pipe.RenderTargetCacheFlushEnable = bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must + * be set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ +#if GEN_GEN >= 12 + pipe.DepthStallEnable = + pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT); +#else pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT; +#endif + pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; @@ -1780,9 +2157,19 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) if (GEN_GEN == 9 && pipe.VFCacheInvalidationEnable) { pipe.PostSyncOperation = WriteImmediateData; pipe.Address = - (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; + } + } + +#if GEN_GEN == 12 + if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && + cmd_buffer->device->info.has_aux_map) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num); + lri.DataDWord = 1; } } +#endif bits &= ~ANV_PIPE_INVALIDATE_BITS; } @@ -1828,24 +2215,34 @@ void genX(CmdPipelineBarrier)( const VkImageSubresourceRange *range = &pImageMemoryBarriers[i].subresourceRange; + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->extent.depth, range->baseMipLevel); + } else { + base_layer = range->baseArrayLayer; + layer_count = anv_get_layerCount(image, range); + } + if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, pImageMemoryBarriers[i].oldLayout, pImageMemoryBarriers[i].newLayout); - } else if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + } + + if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + range->baseMipLevel, + anv_get_levelCount(image, range), + base_layer, layer_count, + pImageMemoryBarriers[i].oldLayout, + pImageMemoryBarriers[i].newLayout); + } + + if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { VkImageAspectFlags color_aspects = anv_image_expand_aspects(image, range->aspectMask); uint32_t aspect_bit; - - uint32_t base_layer, layer_count; - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(image->extent.depth, range->baseMipLevel); - } else { - base_layer = range->baseArrayLayer; - layer_count = anv_get_layerCount(image, range); - } - anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) { transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit, range->baseMipLevel, @@ -1929,34 +2326,6 @@ cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; } -static const struct anv_descriptor * -anv_descriptor_for_binding(const struct 
anv_cmd_pipeline_state *pipe_state, - const struct anv_pipeline_binding *binding) -{ - assert(binding->set < MAX_SETS); - const struct anv_descriptor_set *set = - pipe_state->descriptors[binding->set]; - const uint32_t offset = - set->layout->binding[binding->binding].descriptor_index; - return &set->descriptors[offset + binding->index]; -} - -static uint32_t -dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state, - const struct anv_pipeline_binding *binding) -{ - assert(binding->set < MAX_SETS); - const struct anv_descriptor_set *set = - pipe_state->descriptors[binding->set]; - - uint32_t dynamic_offset_idx = - pipe_state->layout->set[binding->set].dynamic_offset_start + - set->layout->binding[binding->binding].dynamic_offset_index + - binding->index; - - return pipe_state->dynamic_offsets[dynamic_offset_idx]; -} - static struct anv_address anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer, struct anv_descriptor_set *set) @@ -1964,7 +2333,7 @@ anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer, if (set->pool) { /* This is a normal descriptor set */ return (struct anv_address) { - .bo = &set->pool->bo, + .bo = set->pool->bo, .offset = set->desc_mem.offset, }; } else { @@ -1982,26 +2351,37 @@ anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer, } } +static struct anv_cmd_pipeline_state * +pipe_state_for_stage(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_COMPUTE: + return &cmd_buffer->state.compute.base; + + case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: + case MESA_SHADER_GEOMETRY: + case MESA_SHADER_FRAGMENT: + return &cmd_buffer->state.gfx.base; + + default: + unreachable("invalid stage"); + } +} + static VkResult emit_binding_table(struct anv_cmd_buffer *cmd_buffer, gl_shader_stage stage, struct anv_state *bt_state) { - const struct gen_device_info *devinfo = &cmd_buffer->device->info; struct anv_subpass *subpass = cmd_buffer->state.subpass; - struct anv_cmd_pipeline_state *pipe_state; - struct anv_pipeline *pipeline; uint32_t state_offset; - switch (stage) { - case MESA_SHADER_COMPUTE: - pipe_state = &cmd_buffer->state.compute.base; - break; - default: - pipe_state = &cmd_buffer->state.gfx.base; - break; - } - pipeline = pipe_state->pipeline; + struct anv_cmd_pipeline_state *pipe_state = + pipe_state_for_stage(cmd_buffer, stage); + struct anv_pipeline *pipeline = pipe_state->pipeline; if (!anv_pipeline_has_stage(pipeline, stage)) { *bt_state = (struct anv_state) { 0, }; @@ -2022,26 +2402,25 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, if (bt_state->map == NULL) return VK_ERROR_OUT_OF_DEVICE_MEMORY; - /* We only use push constant space for images before gen9 */ - if (map->image_param_count > 0) { - VkResult result = - anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images); - if (result != VK_SUCCESS) - return result; - - cmd_buffer->state.push_constants_dirty |= 1 << stage; - } + /* We only need to emit relocs if we're not using softpin. If we are using + * softpin then we always keep all user-allocated memory objects resident. 
+ */ + const bool need_client_mem_relocs = + !cmd_buffer->device->physical->use_softpin; - uint32_t image = 0; for (uint32_t s = 0; s < map->surface_count; s++) { struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; struct anv_state surface_state; - if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) { + switch (binding->set) { + case ANV_DESCRIPTOR_SET_NULL: + bt_map[s] = 0; + break; + + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: /* Color attachment binding */ assert(stage == MESA_SHADER_FRAGMENT); - assert(binding->binding == 0); if (binding->index < subpass->color_count) { const unsigned att = subpass->color_attachments[binding->index].attachment; @@ -2062,8 +2441,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, } bt_map[s] = surface_state.offset + state_offset; - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { + break; + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: { struct anv_state surface_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer); @@ -2082,12 +2462,12 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, bt_map[s] = surface_state.offset + state_offset; add_surface_reloc(cmd_buffer, surface_state, constant_data); - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS) { + break; + } + + case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: { /* This is always the first binding for compute shaders */ assert(stage == MESA_SHADER_COMPUTE && s == 0); - if (!get_cs_prog_data(pipeline)->uses_num_work_groups) - continue; struct anv_state surface_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer); @@ -2099,167 +2479,147 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.compute.num_workgroups, 12, 1); bt_map[s] = surface_state.offset + state_offset; - add_surface_reloc(cmd_buffer, surface_state, - cmd_buffer->state.compute.num_workgroups); - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + cmd_buffer->state.compute.num_workgroups); + } + break; + } + + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { /* This is a descriptor set buffer so the set index is actually * given by binding->binding. (Yes, that's confusing.) */ struct anv_descriptor_set *set = - pipe_state->descriptors[binding->binding]; + pipe_state->descriptors[binding->index]; assert(set->desc_mem.alloc_size); assert(set->desc_surface_state.alloc_size); bt_map[s] = set->desc_surface_state.offset + state_offset; add_surface_reloc(cmd_buffer, set->desc_surface_state, anv_descriptor_set_address(cmd_buffer, set)); - continue; + break; } - const struct anv_descriptor *desc = - anv_descriptor_for_binding(pipe_state, binding); + default: { + assert(binding->set < MAX_SETS); + const struct anv_descriptor *desc = + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; - switch (desc->type) { - case VK_DESCRIPTOR_TYPE_SAMPLER: - /* Nothing for us to do here */ - continue; + switch (desc->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* Nothing for us to do here */ + continue; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { - struct anv_surface_state sstate = - (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? 
- desc->image_view->planes[binding->plane].general_sampler_surface_state : - desc->image_view->planes[binding->plane].optimal_sampler_surface_state; - surface_state = sstate.state; - assert(surface_state.alloc_size); - add_surface_state_relocs(cmd_buffer, sstate); - break; - } - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - assert(stage == MESA_SHADER_FRAGMENT); - if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { - /* For depth and stencil input attachments, we treat it like any - * old texture that a user may have bound. - */ + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { struct anv_surface_state sstate = (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? desc->image_view->planes[binding->plane].general_sampler_surface_state : desc->image_view->planes[binding->plane].optimal_sampler_surface_state; surface_state = sstate.state; assert(surface_state.alloc_size); - add_surface_state_relocs(cmd_buffer, sstate); - } else { - /* For color input attachments, we create the surface state at - * vkBeginRenderPass time so that we can include aux and clear - * color information. - */ - assert(binding->input_attachment_index < subpass->input_count); - const unsigned subpass_att = binding->input_attachment_index; - const unsigned att = subpass->input_attachments[subpass_att].attachment; - surface_state = cmd_buffer->state.attachments[att].input.state; + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + break; } - break; - - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { - struct anv_surface_state sstate = (binding->write_only) - ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state - : desc->image_view->planes[binding->plane].storage_surface_state; - surface_state = sstate.state; - assert(surface_state.alloc_size); - add_surface_state_relocs(cmd_buffer, sstate); - if (devinfo->gen < 9) { - /* We only need the image params on gen8 and earlier. No image - * workarounds that require tiling information are required on - * SKL and above. - */ - assert(image < MAX_GEN8_IMAGES); - struct brw_image_param *image_param = - &cmd_buffer->state.push_constants[stage]->images[image++]; + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + assert(stage == MESA_SHADER_FRAGMENT); + if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { + /* For depth and stencil input attachments, we treat it like any + * old texture that a user may have bound. + */ + assert(desc->image_view->n_planes == 1); + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? + desc->image_view->planes[0].general_sampler_surface_state : + desc->image_view->planes[0].optimal_sampler_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + } else { + /* For color input attachments, we create the surface state at + * vkBeginRenderPass time so that we can include aux and clear + * color information. + */ + assert(binding->input_attachment_index < subpass->input_count); + const unsigned subpass_att = binding->input_attachment_index; + const unsigned att = subpass->input_attachments[subpass_att].attachment; + surface_state = cmd_buffer->state.attachments[att].input.state; + } + break; - *image_param = - desc->image_view->planes[binding->plane].storage_image_param; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { + struct anv_surface_state sstate = (binding->write_only) + ? 
desc->image_view->planes[binding->plane].writeonly_storage_surface_state + : desc->image_view->planes[binding->plane].storage_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + break; } - break; - } - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - surface_state = desc->buffer_view->surface_state; - assert(surface_state.alloc_size); - add_surface_reloc(cmd_buffer, surface_state, - desc->buffer_view->address); - break; - - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { - /* Compute the offset within the buffer */ - uint32_t dynamic_offset = - dynamic_offset_for_binding(pipe_state, binding); - uint64_t offset = desc->offset + dynamic_offset; - /* Clamp to the buffer size */ - offset = MIN2(offset, desc->buffer->size); - /* Clamp the range to the buffer size */ - uint32_t range = MIN2(desc->range, desc->buffer->size - offset); - - struct anv_address address = - anv_address_add(desc->buffer->address, offset); - - surface_state = - anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); - enum isl_format format = - anv_isl_format_for_descriptor_type(desc->type); + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + surface_state = desc->buffer_view->surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + /* Compute the offset within the buffer */ + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + + uint32_t dynamic_offset = + push->dynamic_offsets[binding->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->size); + /* Clamp the range to the buffer size */ + uint32_t range = MIN2(desc->range, desc->buffer->size - offset); + + struct anv_address address = + anv_address_add(desc->buffer->address, offset); + + surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); + enum isl_format format = + anv_isl_format_for_descriptor_type(desc->type); + + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, address, range, 1); + if (need_client_mem_relocs) + add_surface_reloc(cmd_buffer, surface_state, address); + break; + } - anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, - format, address, range, 1); - add_surface_reloc(cmd_buffer, surface_state, address); - break; - } + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + surface_state = (binding->write_only) + ? desc->buffer_view->writeonly_storage_surface_state + : desc->buffer_view->storage_surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + break; - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - surface_state = (binding->write_only) - ? 
desc->buffer_view->writeonly_storage_surface_state - : desc->buffer_view->storage_surface_state; - assert(surface_state.alloc_size); - add_surface_reloc(cmd_buffer, surface_state, - desc->buffer_view->address); - if (devinfo->gen < 9) { - assert(image < MAX_GEN8_IMAGES); - struct brw_image_param *image_param = - &cmd_buffer->state.push_constants[stage]->images[image++]; - - *image_param = desc->buffer_view->storage_image_param; + default: + assert(!"Invalid descriptor type"); + continue; } + bt_map[s] = surface_state.offset + state_offset; break; - - default: - assert(!"Invalid descriptor type"); - continue; } - - bt_map[s] = surface_state.offset + state_offset; - } - assert(image == map->image_param_count); - -#if GEN_GEN >= 11 - /* The PIPE_CONTROL command description says: - * - * "Whenever a Binding Table Index (BTI) used by a Render Taget Message - * points to a different RENDER_SURFACE_STATE, SW must issue a Render - * Target Cache Flush by enabling this bit. When render target flush - * is set due to new association of BTI, PS Scoreboard Stall bit must - * be set in this packet." - * - * FINISHME: Currently we shuffle around the surface states in the binding - * table based on if they are getting used or not. So, we've to do below - * pipe control flush for every binding table upload. Make changes so - * that we do it only when we modify render target surface states. - */ - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.RenderTargetCacheFlushEnable = true; - pc.StallAtPixelScoreboard = true; + } } -#endif return VK_SUCCESS; } @@ -2270,8 +2630,7 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer, struct anv_state *state) { struct anv_cmd_pipeline_state *pipe_state = - stage == MESA_SHADER_COMPUTE ? &cmd_buffer->state.compute.base : - &cmd_buffer->state.gfx.base; + pipe_state_for_stage(cmd_buffer, stage); struct anv_pipeline *pipeline = pipe_state->pipeline; if (!anv_pipeline_has_stage(pipeline, stage)) { @@ -2294,7 +2653,7 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer, for (uint32_t s = 0; s < map->sampler_count; s++) { struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; const struct anv_descriptor *desc = - anv_descriptor_for_binding(pipe_state, binding); + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) @@ -2316,10 +2675,9 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer, } static uint32_t -flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer) +flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer, + struct anv_pipeline *pipeline) { - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; - VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty & pipeline->active_stages; @@ -2412,9 +2770,61 @@ cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer, } } +#if GEN_GEN >= 8 || GEN_IS_HASWELL +static struct anv_address +get_push_range_address(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + const struct anv_push_range *range) +{ + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + /* This is a descriptor set buffer so the set index is + * actually given by binding->binding. (Yes, that's + * confusing.) 
+ */ + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->index]; + return anv_descriptor_set_address(cmd_buffer, set); + break; + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: { + struct anv_state state = + anv_cmd_buffer_push_constants(cmd_buffer, stage); + return (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = state.offset, + }; + break; + } + + default: { + assert(range->set < MAX_SETS); + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->set]; + const struct anv_descriptor *desc = + &set->descriptors[range->index]; + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + return desc->buffer_view->address; + } else { + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + uint32_t dynamic_offset = + push->dynamic_offsets[range->dynamic_offset_index]; + return anv_address_add(desc->buffer->address, + desc->offset + dynamic_offset); + } + } + } +} +#endif + static void -cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, - VkShaderStageFlags dirty_stages) +cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, unsigned buffer_count) { const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; const struct anv_pipeline *pipeline = gfx_state->base.pipeline; @@ -2428,141 +2838,188 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, [MESA_SHADER_COMPUTE] = 0, }; - VkShaderStageFlags flushed = 0; + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + assert(push_constant_opcodes[stage] > 0); - anv_foreach_stage(stage, dirty_stages) { - assert(stage < ARRAY_SIZE(push_constant_opcodes)); - assert(push_constant_opcodes[stage] > 0); + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { + c._3DCommandSubOpcode = push_constant_opcodes[stage]; - anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { - c._3DCommandSubOpcode = push_constant_opcodes[stage]; + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; - if (anv_pipeline_has_stage(pipeline, stage)) { -#if GEN_GEN >= 8 || GEN_IS_HASWELL - const struct brw_stage_prog_data *prog_data = - pipeline->shaders[stage]->prog_data; - const struct anv_pipeline_bind_map *bind_map = - &pipeline->shaders[stage]->bind_map; +#if GEN_GEN >= 12 + c.MOCS = cmd_buffer->device->isl_dev.mocs.internal; +#endif - /* The Skylake PRM contains the following restriction: - * - * "The driver must ensure The following case does not occur - * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with - * buffer 3 read length equal to zero committed followed by a - * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to - * zero committed." - * - * To avoid this, we program the buffers in the highest slots. - * This way, slot 0 is only used if slot 3 is also used. 
- */ - int n = 3; - - for (int i = 3; i >= 0; i--) { - const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; - if (range->length == 0) - continue; - - const unsigned surface = - prog_data->binding_table.ubo_start + range->block; - - assert(surface <= bind_map->surface_count); - const struct anv_pipeline_binding *binding = - &bind_map->surface_to_descriptor[surface]; - - struct anv_address read_addr; - uint32_t read_len; - if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { - struct anv_address constant_data = { - .bo = pipeline->device->dynamic_state_pool.block_pool.bo, - .offset = pipeline->shaders[stage]->constant_data.offset, - }; - unsigned constant_data_size = - pipeline->shaders[stage]->constant_data_size; - - read_len = MIN2(range->length, - DIV_ROUND_UP(constant_data_size, 32) - range->start); - read_addr = anv_address_add(constant_data, - range->start * 32); - } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { - /* This is a descriptor set buffer so the set index is - * actually given by binding->binding. (Yes, that's - * confusing.) - */ - struct anv_descriptor_set *set = - gfx_state->base.descriptors[binding->binding]; - struct anv_address desc_buffer_addr = - anv_descriptor_set_address(cmd_buffer, set); - const unsigned desc_buffer_size = set->desc_mem.alloc_size; - - read_len = MIN2(range->length, - DIV_ROUND_UP(desc_buffer_size, 32) - range->start); - read_addr = anv_address_add(desc_buffer_addr, - range->start * 32); - } else { - const struct anv_descriptor *desc = - anv_descriptor_for_binding(&gfx_state->base, binding); - - if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { - read_len = MIN2(range->length, - DIV_ROUND_UP(desc->buffer_view->range, 32) - range->start); - read_addr = anv_address_add(desc->buffer_view->address, - range->start * 32); - } else { - assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); - - uint32_t dynamic_offset = - dynamic_offset_for_binding(&gfx_state->base, binding); - uint32_t buf_offset = - MIN2(desc->offset + dynamic_offset, desc->buffer->size); - uint32_t buf_range = - MIN2(desc->range, desc->buffer->size - buf_offset); - - read_len = MIN2(range->length, - DIV_ROUND_UP(buf_range, 32) - range->start); - read_addr = anv_address_add(desc->buffer->address, - buf_offset + range->start * 32); - } - } +#if GEN_GEN >= 8 || GEN_IS_HASWELL + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. 
+ */ + assert(buffer_count <= 4); + const unsigned shift = 4 - buffer_count; + for (unsigned i = 0; i < buffer_count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; - if (read_len > 0) { - c.ConstantBody.Buffer[n] = read_addr; - c.ConstantBody.ReadLength[n] = read_len; - n--; - } - } + /* At this point we only have non-empty ranges */ + assert(range->length > 0); - struct anv_state state = - anv_cmd_buffer_push_constants(cmd_buffer, stage); + /* For Ivy Bridge, make sure we only set the first range (actual + * push constants) + */ + assert((GEN_GEN >= 8 || GEN_IS_HASWELL) || i == 0); - if (state.alloc_size > 0) { - c.ConstantBody.Buffer[n] = (struct anv_address) { - .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, - .offset = state.offset, - }; - c.ConstantBody.ReadLength[n] = - DIV_ROUND_UP(state.alloc_size, 32); - } + const struct anv_address addr = + get_push_range_address(cmd_buffer, stage, range); + c.ConstantBody.ReadLength[i + shift] = range->length; + c.ConstantBody.Buffer[i + shift] = + anv_address_add(addr, range->start * 32); + } #else - /* For Ivy Bridge, the push constants packets have a different - * rule that would require us to iterate in the other direction - * and possibly mess around with dynamic state base address. - * Don't bother; just emit regular push constants at n = 0. - */ + /* For Ivy Bridge, push constants are relative to dynamic state + * base address and we only ever push actual push constants. + */ + if (bind_map->push_ranges[0].length > 0) { + assert(bind_map->push_ranges[0].set == + ANV_DESCRIPTOR_SET_PUSH_CONSTANTS); struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage); + c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length; + c.ConstantBody.Buffer[0].bo = NULL; + c.ConstantBody.Buffer[0].offset = state.offset; + } + assert(bind_map->push_ranges[1].length == 0); + assert(bind_map->push_ranges[2].length == 0); + assert(bind_map->push_ranges[3].length == 0); +#endif + } + } +} - if (state.alloc_size > 0) { - c.ConstantBody.Buffer[0].offset = state.offset, - c.ConstantBody.ReadLength[0] = - DIV_ROUND_UP(state.alloc_size, 32); - } +#if GEN_GEN >= 12 +static void +cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer, + uint32_t shader_mask, uint32_t count) +{ + if (count == 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) { + c.ShaderUpdateEnable = shader_mask; + c.MOCS = cmd_buffer->device->isl_dev.mocs.internal; + } + return; + } + + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_pipeline *pipeline = gfx_state->base.pipeline; + + static const uint32_t push_constant_opcodes[] = { + [MESA_SHADER_VERTEX] = 21, + [MESA_SHADER_TESS_CTRL] = 25, /* HS */ + [MESA_SHADER_TESS_EVAL] = 26, /* DS */ + [MESA_SHADER_GEOMETRY] = 22, + [MESA_SHADER_FRAGMENT] = 23, + [MESA_SHADER_COMPUTE] = 0, + }; + + gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask); + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + assert(push_constant_opcodes[stage] > 0); + + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + + uint32_t *dw; + const uint32_t buffers = (1 << count) - 1; + const uint32_t num_dwords = 2 + 2 * count; + + dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_CONSTANT_ALL), + .ShaderUpdateEnable = shader_mask, + .PointerBufferMask = buffers, + .MOCS = cmd_buffer->device->isl_dev.mocs.internal); + + for (int i = 0; i < count; i++) { + const struct anv_push_range 
*range = &bind_map->push_ranges[i]; + const struct anv_address addr = + get_push_range_address(cmd_buffer, stage, range); + + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)( + &cmd_buffer->batch, dw + 2 + i * 2, + &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) { + .PointerToConstantBuffer = anv_address_add(addr, range->start * 32), + .ConstantBufferReadLength = range->length, + }); + } +} #endif + +static void +cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, + VkShaderStageFlags dirty_stages) +{ + VkShaderStageFlags flushed = 0; + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_pipeline *pipeline = gfx_state->base.pipeline; + +#if GEN_GEN >= 12 + uint32_t nobuffer_stages = 0; +#endif + + anv_foreach_stage(stage, dirty_stages) { + unsigned buffer_count = 0; + flushed |= mesa_to_vk_shader_stage(stage); + uint32_t max_push_range = 0; + + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + if (range->length > 0) { + buffer_count++; + if (GEN_GEN >= 12 && range->length > max_push_range) + max_push_range = range->length; + } } } - flushed |= mesa_to_vk_shader_stage(stage); +#if GEN_GEN >= 12 + /* If this stage doesn't have any push constants, emit it later in a + * single CONSTANT_ALL packet. + */ + if (buffer_count == 0) { + nobuffer_stages |= 1 << stage; + continue; + } + + /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL + * contains only 5 bits, so we can only use it for buffers smaller than + * 32. + */ + if (max_push_range < 32) { + cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage, + buffer_count); + continue; + } +#endif + + cmd_buffer_emit_push_constant(cmd_buffer, stage, buffer_count); } +#if GEN_GEN >= 12 + if (nobuffer_stages) + cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, 0); +#endif + cmd_buffer->state.push_constants_dirty &= ~flushed; } @@ -2578,7 +3035,9 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); - genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->l3_config); + + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1); genX(flush_pipeline_select_3d)(cmd_buffer); @@ -2613,6 +3072,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) #endif }; +#if GEN_GEN >= 8 && GEN_GEN <= 9 + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb, + state.BufferStartingAddress, + state.BufferSize); +#endif + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); i++; } @@ -2628,11 +3093,16 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx]; anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { +#if GEN_GEN < 12 sob.SOBufferIndex = idx; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx; +#endif if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { sob.SOBufferEnable = true; - sob.MOCS = cmd_buffer->device->default_mocs, + sob.MOCS = cmd_buffer->device->isl_dev.mocs.internal, sob.StreamOffsetWriteEnable = false; sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address, xfb->offset); @@ -2651,11 
+3121,6 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); - /* The exact descriptor layout is pulled from the pipeline, so we need - * to re-emit binding tables on every pipeline change. - */ - cmd_buffer->state.descriptors_dirty |= pipeline->active_stages; - /* If the pipeline changed, we may need to re-allocate push constant * space in the URB. */ @@ -2679,7 +3144,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) pc.DepthStallEnable = true; pc.PostSyncOperation = WriteImmediateData; pc.Address = - (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; } } #endif @@ -2697,7 +3162,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) */ uint32_t dirty = 0; if (cmd_buffer->state.descriptors_dirty) - dirty = flush_descriptor_sets(cmd_buffer); + dirty = flush_descriptor_sets(cmd_buffer, pipeline); if (dirty || cmd_buffer->state.push_constants_dirty) { /* Because we're pushing UBOs, we have to push whenever either @@ -2725,8 +3190,6 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) gen7_cmd_buffer_emit_scissor(cmd_buffer); genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); - - genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } static void @@ -2742,7 +3205,8 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, .VertexBufferIndex = index, .AddressModifyEnable = true, .BufferPitch = 0, - .MOCS = anv_mocs_for_bo(cmd_buffer->device, addr.bo), + .MOCS = addr.bo ? anv_mocs_for_bo(cmd_buffer->device, addr.bo) : 0, + .NullVertexBuffer = size == 0, #if (GEN_GEN >= 8) .BufferStartingAddress = addr, .BufferSize = size @@ -2751,31 +3215,38 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, .EndAddress = anv_address_add(addr, size), #endif }); + + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, + index, addr, size); } static void emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, struct anv_address addr) { - emit_vertex_bo(cmd_buffer, addr, 8, ANV_SVGS_VB_INDEX); + emit_vertex_bo(cmd_buffer, addr, addr.bo ? 
8 : 0, ANV_SVGS_VB_INDEX); } static void emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, uint32_t base_vertex, uint32_t base_instance) { - struct anv_state id_state = - anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); + if (base_vertex == 0 && base_instance == 0) { + emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS); + } else { + struct anv_state id_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); - ((uint32_t *)id_state.map)[0] = base_vertex; - ((uint32_t *)id_state.map)[1] = base_instance; + ((uint32_t *)id_state.map)[0] = base_vertex; + ((uint32_t *)id_state.map)[1] = base_instance; - struct anv_address addr = { - .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, - .offset = id_state.offset, - }; + struct anv_address addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = id_state.offset, + }; - emit_base_vertex_instance_bo(cmd_buffer, addr); + emit_base_vertex_instance_bo(cmd_buffer, addr); + } } static void @@ -2794,6 +3265,25 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); } +static void +update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type) +{ + struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint64_t vb_used = pipeline->vb_used; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + vb_used |= 1ull << ANV_SVGS_VB_INDEX; + if (vs_prog_data->uses_drawid) + vb_used |= 1ull << ANV_DRAWID_VB_INDEX; + + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, + access_type == RANDOM, + vb_used); +} + void genX(CmdDraw)( VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -2819,6 +3309,11 @@ void genX(CmdDraw)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. */ @@ -2834,6 +3329,8 @@ void genX(CmdDraw)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = 0; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); } void genX(CmdDrawIndexed)( @@ -2862,6 +3359,11 @@ void genX(CmdDrawIndexed)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. */ @@ -2877,6 +3379,8 @@ void genX(CmdDrawIndexed)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = vertexOffset; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); } /* Auto-Draw / Indirect Registers */ @@ -2916,6 +3420,11 @@ void genX(CmdDrawIndirectByteCountEXT)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. 
*/ @@ -2927,7 +3436,7 @@ void genX(CmdDrawIndirectByteCountEXT)( gen_mi_mem32(anv_address_add(counter_buffer->address, counterBufferOffset)); if (counterOffset) - count = gen_mi_iadd(&b, count, gen_mi_imm(-counterOffset)); + count = gen_mi_isub(&b, count, gen_mi_imm(counterOffset)); count = gen_mi_udiv32_imm(&b, count, vertexStride); gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_VERTEX_COUNT), count); @@ -2944,6 +3453,8 @@ void genX(CmdDrawIndirectByteCountEXT)( prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = pipeline->topology; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); #endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ } @@ -3014,6 +3525,11 @@ void genX(CmdDrawIndirect)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, false); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { @@ -3023,6 +3539,8 @@ void genX(CmdDrawIndirect)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } } @@ -3057,6 +3575,11 @@ void genX(CmdDrawIndexedIndirect)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { @@ -3066,6 +3589,8 @@ void genX(CmdDrawIndexedIndirect)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } } @@ -3160,7 +3685,7 @@ emit_draw_count_predicate_with_conditional_render( } #endif -void genX(CmdDrawIndirectCountKHR)( +void genX(CmdDrawIndirectCount)( VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -3206,6 +3731,11 @@ void genX(CmdDrawIndirectCountKHR)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, false); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { @@ -3215,11 +3745,13 @@ void genX(CmdDrawIndirectCountKHR)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } } -void genX(CmdDrawIndexedIndirectCountKHR)( +void genX(CmdDrawIndexedIndirectCount)( VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -3266,6 +3798,11 @@ void genX(CmdDrawIndexedIndirectCountKHR)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { @@ -3275,6 +3812,8 @@ void genX(CmdDrawIndexedIndirectCountKHR)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } } @@ -3380,71 +3919,14 @@ void genX(CmdEndTransformFeedbackEXT)( cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; } -static VkResult -flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) -{ - struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; - struct anv_state surfaces = { 0, }, samplers = { 0, }; - VkResult result; - - result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); - if (result != VK_SUCCESS) { - assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); - - result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); - if (result != VK_SUCCESS) - return result; - - /* Re-emit state base addresses so we get the new surface state base - * address before we start emitting binding tables etc. - */ - genX(cmd_buffer_emit_state_base_address)(cmd_buffer); - - result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); - if (result != VK_SUCCESS) { - anv_batch_set_error(&cmd_buffer->batch, result); - return result; - } - } - - result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers); - if (result != VK_SUCCESS) { - anv_batch_set_error(&cmd_buffer->batch, result); - return result; - } - - uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; - struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { - .BindingTablePointer = surfaces.offset, - .SamplerStatePointer = samplers.offset, - }; - GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); - - struct anv_state state = - anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, - pipeline->interface_descriptor_data, - GENX(INTERFACE_DESCRIPTOR_DATA_length), - 64); - - uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); - anv_batch_emit(&cmd_buffer->batch, - GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { - mid.InterfaceDescriptorTotalLength = size; - mid.InterfaceDescriptorDataStartAddress = state.offset; - } - - return VK_SUCCESS; -} - void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) { struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; - MAYBE_UNUSED VkResult result; assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT); - genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->l3_config); genX(flush_pipeline_select_gpgpu)(cmd_buffer); @@ -3461,16 +3943,38 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); + + /* The workgroup size of the pipeline affects our push constant layout + * so flag push constants as dirty if we change the pipeline. 
+ */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || cmd_buffer->state.compute.pipeline_dirty) { - /* FIXME: figure out descriptors for gen7 */ - result = flush_compute_descriptor_set(cmd_buffer); - if (result != VK_SUCCESS) - return; + flush_descriptor_sets(cmd_buffer, pipeline); + + uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .BindingTablePointer = + cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, + .SamplerStatePointer = + cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, + }; + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); + + struct anv_state state = + anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, + pipeline->interface_descriptor_data, + GENX(INTERFACE_DESCRIPTOR_DATA_length), + 64); - cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; + uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); + anv_batch_emit(&cmd_buffer->batch, + GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { + mid.InterfaceDescriptorTotalLength = size; + mid.InterfaceDescriptorDataStartAddress = state.offset; + } } if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { @@ -3499,8 +4003,8 @@ verify_cmd_parser(const struct anv_device *device, int required_version, const char *function) { - if (device->instance->physicalDevice.cmd_parser_version < required_version) { - return vk_errorf(device->instance, device->instance, + if (device->physical->cmd_parser_version < required_version) { + return vk_errorf(device, device->physical, VK_ERROR_FEATURE_NOT_PRESENT, "cmd parser version %d is required for %s", required_version, function); @@ -3520,22 +4024,14 @@ anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer, if (anv_batch_has_error(&cmd_buffer->batch)) return; - VkResult result = - anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, MESA_SHADER_COMPUTE, - base_work_group_id); - if (result != VK_SUCCESS) { - cmd_buffer->batch.status = result; - return; - } - struct anv_push_constants *push = - cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; - if (push->base_work_group_id[0] != baseGroupX || - push->base_work_group_id[1] != baseGroupY || - push->base_work_group_id[2] != baseGroupZ) { - push->base_work_group_id[0] = baseGroupX; - push->base_work_group_id[1] = baseGroupY; - push->base_work_group_id[2] = baseGroupZ; + &cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; + if (push->cs.base_work_group_id[0] != baseGroupX || + push->cs.base_work_group_id[1] != baseGroupY || + push->cs.base_work_group_id[2] != baseGroupZ) { + push->cs.base_work_group_id[0] = baseGroupX; + push->cs.base_work_group_id[1] = baseGroupY; + push->cs.base_work_group_id[2] = baseGroupZ; cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } @@ -3580,6 +4076,9 @@ void genX(CmdDispatchBase)( .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, .offset = state.offset, }; + + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } genX(cmd_buffer_flush_compute_state)(cmd_buffer); @@ -3630,9 +4129,13 @@ void genX(CmdDispatchIndirect)( return; #endif - if (prog_data->uses_num_work_groups) + if (prog_data->uses_num_work_groups) { cmd_buffer->state.compute.num_workgroups = addr; + /* The num_workgroups buffer goes in the binding table */ + 
cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + genX(cmd_buffer_flush_compute_state)(cmd_buffer); struct gen_mi_builder b; @@ -3735,6 +4238,32 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t); #endif +#if GEN_GEN == 9 + if (pipeline == _3D) { + /* There is a mid-object preemption workaround which requires you to + * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However, + * even without preemption, we have issues with geometry flickering when + * GPGPU and 3D are back-to-back and this seems to fix it. We don't + * really know why. + */ + const uint32_t subslices = + MAX2(cmd_buffer->device->physical->subslice_total, 1); + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) { + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * subslices - 1; + vfe.NumberofURBEntries = 2; + vfe.URBEntryAllocationSize = 2; + } + + /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is + * invalid. Set the compute pipeline to dirty to force a re-emit of the + * pipeline in case we get back-to-back dispatch calls with the same + * pipeline and a PIPELINE_SELECT in between. + */ + cmd_buffer->state.compute.pipeline_dirty = true; + } +#endif + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] * PIPELINE_SELECT [DevBWR+]": * @@ -3751,6 +4280,14 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, pc.DCFlushEnable = true; pc.PostSyncOperation = NoWrite; pc.CommandStreamerStallEnable = true; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; + + /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be + * set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ + pc.DepthStallEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { @@ -3759,6 +4296,9 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, pc.StateCacheInvalidationEnable = true; pc.InstructionCacheInvalidateEnable = true; pc.PostSyncOperation = NoWrite; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { @@ -3825,12 +4365,221 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthCacheFlushEnable = true; +#if GEN_GEN >= 12 + pipe.TileCacheFlushEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthStallEnable = true; } } +/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: + * + * "The VF cache needs to be invalidated before binding and then using + * Vertex Buffers that overlap with any previously bound Vertex Buffer + * (at a 64B granularity) since the last invalidation. A VF cache + * invalidate is performed by setting the "VF Cache Invalidation Enable" + * bit in PIPE_CONTROL." + * + * This is implemented by carefully tracking all vertex and index buffer + * bindings and flushing if the cache ever ends up with a range in the cache + * that would exceed 4 GiB. This is implemented in three parts: + * + * 1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called + * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the + * tracking code of the new binding. If this new binding would cause + * the cache to have a too-large range on the next draw call, a pipeline + * stall and VF cache invalidate are added to pending_pipeline_bits. + * + * 2. 
genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to + * empty whenever we emit a VF invalidate. + * + * 3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called + * after every 3DPRIMITIVE and copies the bound range into the dirty + * range for each used buffer. This has to be a separate step because + * we don't always re-bind all buffers and so 1. can't know which + * buffers are actually bound. + */ +void +genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->physical->use_softpin) + return; + + struct anv_vb_cache_range *bound, *dirty; + if (vb_index == -1) { + bound = &cmd_buffer->state.gfx.ib_bound_range; + dirty = &cmd_buffer->state.gfx.ib_dirty_range; + } else { + assert(vb_index >= 0); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index]; + } + + if (vb_size == 0) { + bound->start = 0; + bound->end = 0; + return; + } + + assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED)); + bound->start = gen_48b_address(anv_address_physical(vb_address)); + bound->end = bound->start + vb_size; + assert(bound->end > bound->start); /* No overflow */ + + /* Align everything to a cache line */ + bound->start &= ~(64ull - 1ull); + bound->end = align_u64(bound->end, 64); + + /* Compute the dirty range */ + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + + /* If our range is larger than 32 bits, we have to flush */ + assert(bound->end - bound->start <= (1ull << 32)); + if (dirty->end - dirty->start > (1ull << 32)) { + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } +} + +void +genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->physical->use_softpin) + return; + + if (access_type == RANDOM) { + /* We have an index buffer */ + struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range; + struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } + + uint64_t mask = vb_used; + while (mask) { + int i = u_bit_scan64(&mask); + assert(i >= 0); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + + struct anv_vb_cache_range *bound, *dirty; + bound = &cmd_buffer->state.gfx.vb_bound_ranges[i]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i]; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } +} + +/** + * Update the pixel hashing modes that determine the balancing of PS threads + * across subslices and slices. + * + * \param width Width bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param height Height bound of the rendering area (already scaled down if \p + * scale is greater than 1). 
+ * \param scale The number of framebuffer samples that could potentially be + * affected by an individual channel of the PS thread. This is + * typically one for single-sampled rendering, but for operations + * like CCS resolves and fast clears a single PS invocation may + * update a huge number of pixels, in which case a finer + * balancing is desirable in order to maximally utilize the + * bandwidth available. UINT_MAX can be used as shorthand for + * "finest hashing mode available". + */ +void +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale) +{ +#if GEN_GEN == 9 + const struct gen_device_info *devinfo = &cmd_buffer->device->info; + const unsigned slice_hashing[] = { + /* Because all Gen9 platforms with more than one slice require + * three-way subslice hashing, a single "normal" 16x16 slice hashing + * block is guaranteed to suffer from substantial imbalance, with one + * subslice receiving twice as much work as the other two in the + * slice. + * + * The performance impact of that would be particularly severe when + * three-way hashing is also in use for slice balancing (which is the + * case for all Gen9 GT4 platforms), because one of the slices + * receives one every three 16x16 blocks in either direction, which + * is roughly the periodicity of the underlying subslice imbalance + * pattern ("roughly" because in reality the hardware's + * implementation of three-way hashing doesn't do exact modulo 3 + * arithmetic, which somewhat decreases the magnitude of this effect + * in practice). This leads to a systematic subslice imbalance + * within that slice regardless of the size of the primitive. The + * 32x32 hashing mode guarantees that the subslice imbalance within a + * single slice hashing block is minimal, largely eliminating this + * effect. + */ + _32x32, + /* Finest slice hashing mode available. */ + NORMAL + }; + const unsigned subslice_hashing[] = { + /* 16x16 would provide a slight cache locality benefit especially + * visible in the sampler L1 cache efficiency of low-bandwidth + * non-LLC platforms, but it comes at the cost of greater subslice + * imbalance for primitives of dimensions approximately intermediate + * between 16x4 and 16x16. + */ + _16x4, + /* Finest subslice hashing mode available. */ + _8x4 + }; + /* Dimensions of the smallest hashing block of a given hashing mode. If + * the rendering area is smaller than this there can't possibly be any + * benefit from switching to this mode, so we optimize out the + * transition. + */ + const unsigned min_size[][2] = { + { 16, 4 }, + { 8, 4 } + }; + const unsigned idx = scale > 1; + + if (cmd_buffer->state.current_hash_scale != scale && + (width > min_size[idx][0] || height > min_size[idx][1])) { + uint32_t gt_mode; + + anv_pack_struct(>_mode, GENX(GT_MODE), + .SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0), + .SliceHashingMask = (devinfo->num_slices > 1 ? 
-1 : 0), + .SubsliceHashing = subslice_hashing[idx], + .SubsliceHashingMask = -1); + + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode); + + cmd_buffer->state.current_hash_scale = scale; + } +#endif +} + static void cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) { @@ -3905,6 +4654,23 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info); + if (GEN_GEN >= 12) { + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* GEN:BUG:1408224581 + * + * Workaround: Gen12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteImmediateData; + pc.Address = + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; + } + } cmd_buffer->state.hiz_enabled = info.hiz_usage == ISL_AUX_USAGE_HIZ; } @@ -3992,7 +4758,7 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, assert(a < cmd_state->pass->attachment_count); struct anv_attachment_state *att_state = &cmd_state->attachments[a]; - struct anv_image_view *iview = fb->attachments[a]; + struct anv_image_view *iview = cmd_state->attachments[a].image_view; const struct anv_image *image = iview->image; /* A resolve is necessary before use as an input attachment if the clear @@ -4015,31 +4781,46 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, target_layout = subpass->attachments[i].layout; } - if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { - assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); + VkImageLayout target_stencil_layout = + subpass->attachments[i].stencil_layout; - uint32_t base_layer, layer_count; - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(iview->image->extent.depth, - iview->planes[0].isl.base_level); - } else { - base_layer = iview->planes[0].isl.base_array_layer; - layer_count = fb->layers; - } + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(iview->image->extent.depth, + iview->planes[0].isl.base_level); + } else { + base_layer = iview->planes[0].isl.base_array_layer; + layer_count = fb->layers; + } + if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, iview->planes[0].isl.base_level, 1, base_layer, layer_count, att_state->current_layout, target_layout); - } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + } + + if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, att_state->current_layout, target_layout); att_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, target_layout); + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + target_layout); + } + + if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + iview->planes[0].isl.base_level, 1, + base_layer, layer_count, + att_state->current_stencil_layout, + 
target_stencil_layout); } att_state->current_layout = target_layout; + att_state->current_stencil_layout = target_stencil_layout; if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) { assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); @@ -4191,7 +4972,7 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, if (GEN_GEN < 10 && (att_state->pending_load_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && - image->planes[0].aux_surface.isl.size_B > 0 && + image->planes[0].aux_usage != ISL_AUX_USAGE_NONE && iview->planes[0].isl.base_level == 0 && iview->planes[0].isl.base_array_layer == 0) { if (att_state->aux_usage != ISL_AUX_USAGE_NONE) { @@ -4264,6 +5045,20 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, } cmd_buffer_emit_depth_stencil(cmd_buffer); + +#if GEN_GEN >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Taget Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT; +#endif } static enum blorp_filter @@ -4321,8 +5116,8 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0; } - struct anv_image_view *src_iview = fb->attachments[src_att]; - struct anv_image_view *dst_iview = fb->attachments[dst_att]; + struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view; + struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view; const VkRect2D render_area = cmd_buffer->state.render_area; @@ -4376,31 +5171,32 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0; } - struct anv_image_view *src_iview = fb->attachments[src_att]; - struct anv_image_view *dst_iview = fb->attachments[dst_att]; + struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view; + struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view; const VkRect2D render_area = cmd_buffer->state.render_area; + struct anv_attachment_state *src_state = + &cmd_state->attachments[src_att]; + struct anv_attachment_state *dst_state = + &cmd_state->attachments[dst_att]; + if ((src_iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) { - struct anv_attachment_state *src_state = - &cmd_state->attachments[src_att]; - struct anv_attachment_state *dst_state = - &cmd_state->attachments[dst_att]; - /* MSAA resolves sample from the source attachment. Transition the * depth attachment first to get rid of any HiZ that we may not be * able to handle. 
*/ transition_depth_buffer(cmd_buffer, src_iview->image, src_state->current_layout, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); src_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - src_state->current_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; /* MSAA resolves write to the resolve attachment as if it were any * other transfer op. Transition the resolve attachment accordingly. @@ -4423,6 +5219,7 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) dst_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; @@ -4447,6 +5244,9 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) if ((src_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) { + src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE; enum isl_aux_usage dst_aux_usage = ISL_AUX_USAGE_NONE; @@ -4469,6 +5269,48 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) } } +#if GEN_GEN == 7 + /* On gen7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in following + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. 
+ */ + if (subpass->depth_stencil_attachment) { + uint32_t a = subpass->depth_stencil_attachment->attachment; + assert(a != VK_ATTACHMENT_UNUSED); + + struct anv_attachment_state *att_state = &cmd_state->attachments[a]; + struct anv_image_view *iview = cmd_state->attachments[a].image_view;; + const struct anv_image *image = iview->image; + + if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + uint32_t plane = anv_image_aspect_to_plane(image->aspects, + VK_IMAGE_ASPECT_STENCIL_BIT); + + if (image->planes[plane].shadow_surface.isl.size_B > 0 && + att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) { + assert(image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT); + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + iview->planes[plane].isl.base_level, 1, + iview->planes[plane].isl.base_array_layer, + fb->layers); + } + } + } +#endif /* GEN_GEN == 7 */ + for (uint32_t i = 0; i < subpass->attachment_count; ++i) { const uint32_t a = subpass->attachments[i].attachment; if (a == VK_ATTACHMENT_UNUSED) @@ -4479,7 +5321,7 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) assert(a < cmd_state->pass->attachment_count); struct anv_attachment_state *att_state = &cmd_state->attachments[a]; - struct anv_image_view *iview = fb->attachments[a]; + struct anv_image_view *iview = cmd_state->attachments[a].image_view; const struct anv_image *image = iview->image; if ((image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && @@ -4495,7 +5337,7 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) * SRGB view & a UNORM image). */ if (fast_clear_type != ANV_FAST_CLEAR_NONE) { - anv_perf_warn(cmd_buffer->device->instance, fb, + anv_perf_warn(cmd_buffer->device, iview, "Doing a partial resolve to get rid of clear color at the " "end of a renderpass due to an image/view format mismatch"); @@ -4534,28 +5376,39 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) /* Transition the image into the final layout for this render pass */ VkImageLayout target_layout = cmd_state->pass->attachments[a].final_layout; + VkImageLayout target_stencil_layout = + cmd_state->pass->attachments[a].stencil_final_layout; + + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(iview->image->extent.depth, + iview->planes[0].isl.base_level); + } else { + base_layer = iview->planes[0].isl.base_array_layer; + layer_count = fb->layers; + } if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); - - uint32_t base_layer, layer_count; - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(iview->image->extent.depth, - iview->planes[0].isl.base_level); - } else { - base_layer = iview->planes[0].isl.base_array_layer; - layer_count = fb->layers; - } - transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, iview->planes[0].isl.base_level, 1, base_layer, layer_count, att_state->current_layout, target_layout); - } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + } + + if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, att_state->current_layout, target_layout); } + + if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + iview->planes[0].isl.base_level, 1, + base_layer, layer_count, + att_state->current_stencil_layout, + target_stencil_layout); + } } /* Accumulate any subpass flushes that need to happen after the subpass. 
@@ -4593,7 +5446,7 @@ void genX(CmdBeginRenderPass)( cmd_buffer_begin_subpass(cmd_buffer, 0); } -void genX(CmdBeginRenderPass2KHR)( +void genX(CmdBeginRenderPass2)( VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBeginInfo, const VkSubpassBeginInfoKHR* pSubpassBeginInfo) @@ -4618,7 +5471,7 @@ void genX(CmdNextSubpass)( cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); } -void genX(CmdNextSubpass2KHR)( +void genX(CmdNextSubpass2)( VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo) @@ -4639,7 +5492,7 @@ void genX(CmdEndRenderPass)( cmd_buffer->state.hiz_enabled = false; #ifndef NDEBUG - anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer); + anv_dump_add_attachments(cmd_buffer); #endif /* Remove references to render pass specific state. This enables us to @@ -4650,7 +5503,7 @@ void genX(CmdEndRenderPass)( cmd_buffer->state.subpass = NULL; } -void genX(CmdEndRenderPass2KHR)( +void genX(CmdEndRenderPass2)( VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo) { @@ -4728,3 +5581,170 @@ void genX(CmdEndConditionalRenderingEXT)( cmd_state->conditional_render_enabled = false; } #endif + +/* Set of stage bits for which are pipelined, i.e. they get queued by the + * command streamer for later execution. + */ +#define ANV_PIPELINE_STAGE_PIPELINED_BITS \ + (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | \ + VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | \ + VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | \ + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | \ + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | \ + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | \ + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | \ + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | \ + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | \ + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | \ + VK_PIPELINE_STAGE_TRANSFER_BIT | \ + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | \ + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | \ + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT) + +void genX(CmdSetEvent)( + VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags stageMask) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_event, event, _event); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } + + pc.DestinationAddressType = DAT_PPGTT, + pc.PostSyncOperation = WriteImmediateData, + pc.Address = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + pc.ImmediateData = VK_EVENT_SET; + } +} + +void genX(CmdResetEvent)( + VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags stageMask) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_event, event, _event); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } + + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + 
event->state.offset + }; + pc.ImmediateData = VK_EVENT_RESET; + } +} + +void genX(CmdWaitEvents)( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags destStageMask, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ +#if GEN_GEN >= 8 + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + for (uint32_t i = 0; i < eventCount; i++) { + ANV_FROM_HANDLE(anv_event, event, pEvents[i]); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) { + sem.WaitMode = PollingMode, + sem.CompareOperation = COMPARE_SAD_EQUAL_SDD, + sem.SemaphoreDataDword = VK_EVENT_SET, + sem.SemaphoreAddress = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + } + } +#else + anv_finishme("Implement events on gen7"); +#endif + + genX(CmdPipelineBarrier)(commandBuffer, srcStageMask, destStageMask, + false, /* byRegion */ + memoryBarrierCount, pMemoryBarriers, + bufferMemoryBarrierCount, pBufferMemoryBarriers, + imageMemoryBarrierCount, pImageMemoryBarriers); +} + +VkResult genX(CmdSetPerformanceOverrideINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceOverrideInfoINTEL* pOverrideInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + switch (pOverrideInfo->type) { + case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: { + uint32_t dw; + +#if GEN_GEN >= 9 + anv_pack_struct(&dw, GENX(CS_DEBUG_MODE2), + ._3DRenderingInstructionDisable = pOverrideInfo->enable, + .MediaInstructionDisable = pOverrideInfo->enable, + ._3DRenderingInstructionDisableMask = true, + .MediaInstructionDisableMask = true); + emit_lri(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2_num), dw); +#else + anv_pack_struct(&dw, GENX(INSTPM), + ._3DRenderingInstructionDisable = pOverrideInfo->enable, + .MediaInstructionDisable = pOverrideInfo->enable, + ._3DRenderingInstructionDisableMask = true, + .MediaInstructionDisableMask = true); + emit_lri(&cmd_buffer->batch, GENX(INSTPM_num), dw); +#endif + break; + } + + case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL: + if (pOverrideInfo->enable) { + /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_FLUSH_BITS | + ANV_PIPE_INVALIDATE_BITS; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + break; + + default: + unreachable("Invalid override"); + } + + return VK_SUCCESS; +} + +VkResult genX(CmdSetPerformanceStreamMarkerINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo) +{ + /* TODO: Waiting on the register to write, might depend on generation. */ + + return VK_SUCCESS; +}
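
The rewritten cmd_buffer_emit_push_constant() above packs the non-empty push ranges into the highest 3DSTATE_CONSTANT_* slots, so that buffer 0 is only ever programmed when buffer 3 is, as the quoted Skylake PRM restriction requires. A minimal standalone sketch of that slot assignment follows; it uses plain arrays in place of anv_push_range and the GENX packing structures, and the function and variable names are illustrative only, not driver API:

#include <assert.h>
#include <stdio.h>

/* Sketch: place `buffer_count` non-empty read lengths (in 32B units) into
 * constant-buffer slots [4 - buffer_count .. 3], leaving the low slots
 * empty, so slot 0 is used only when all four slots are. */
static void
pack_constant_slots(const unsigned *range_lengths, unsigned buffer_count,
                    unsigned read_length[4])
{
   assert(buffer_count <= 4);
   const unsigned shift = 4 - buffer_count;

   for (unsigned s = 0; s < 4; s++)
      read_length[s] = 0;               /* empty slot */

   for (unsigned i = 0; i < buffer_count; i++) {
      assert(range_lengths[i] > 0);     /* only non-empty ranges remain */
      read_length[i + shift] = range_lengths[i];
   }
}

int main(void)
{
   const unsigned ranges[2] = { 8, 4 }; /* two ranges of 8 and 4 x 32B */
   unsigned slots[4];
   pack_constant_slots(ranges, 2, slots);
   /* Prints "0 0 8 4": slots 2 and 3 are used, slots 0 and 1 stay empty. */
   printf("%u %u %u %u\n", slots[0], slots[1], slots[2], slots[3]);
   return 0;
}

The shift is exactly the `4 - buffer_count` computed in the driver path; MOCS, addresses and the Ivy Bridge special case are deliberately left out of the sketch.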
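
On gen12 the push-constant flush loop chooses between the combined 3DSTATE_CONSTANT_ALL packet and the per-stage 3DSTATE_CONSTANT_* packets: stages with no buffers are batched into a single CONSTANT_ALL at the end, and a stage whose largest range does not fit the 5-bit Constant Buffer Read Length field (values 0..31, in 32-byte units) falls back to the per-stage path. A compilable sketch of just that fitness test, with illustrative names:

#include <stdbool.h>

/* Returns true if every range length (in 32B units) fits the 5-bit
 * read-length field of 3DSTATE_CONSTANT_ALL, i.e. the combined packet can
 * be used for this stage; false means emit the per-stage packet instead. */
static bool
can_use_constant_all(const unsigned *lengths, unsigned count)
{
   for (unsigned i = 0; i < count; i++) {
      if (lengths[i] >= 32)             /* 31 is the largest encodable value */
         return false;
   }
   return true;
}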
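
The gen_mi_builder sequence in CmdDrawIndirectByteCountEXT now subtracts counterOffset directly (gen_mi_isub) instead of adding its negation, but the value it computes is unchanged. The CPU-side equivalent of that GPU-side math, written out for clarity with a hypothetical helper name:

#include <stdint.h>

/* Vertex count for vkCmdDrawIndirectByteCountEXT: the transform-feedback
 * byte counter, minus the caller-supplied offset, divided by the vertex
 * stride. The driver evaluates this with MI commands so the GPU reads the
 * counter buffer itself rather than the CPU. */
static uint32_t
byte_count_to_vertex_count(uint32_t counter_value, uint32_t counter_offset,
                           uint32_t vertex_stride)
{
   return (counter_value - counter_offset) / vertex_stride;
}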
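
The gen8/9 VF-cache workaround described in the long comment keeps, per vertex/index buffer slot, the 64-byte-aligned range most recently bound plus the running union of every range bound since the last invalidate, and queues a CS stall with a VF cache invalidate once that union would span more than 4 GiB. A self-contained model of the bookkeeping, using raw 64-bit addresses instead of anv_address and a boolean return in place of pending_pipe_bits (names are illustrative):

#include <stdbool.h>
#include <stdint.h>

struct vb_range { uint64_t start, end; }; /* [start, end); 0/0 means empty */

/* Record a new binding and widen the dirty union; returns true when the
 * dirty union now covers more than 32 bits of address space, i.e. a VF
 * cache invalidate must be flushed before the next draw. */
static bool
track_vb_binding(struct vb_range *bound, struct vb_range *dirty,
                 uint64_t gpu_addr, uint64_t size)
{
   if (size == 0) {
      bound->start = bound->end = 0;    /* slot unbound, nothing to track */
      return false;
   }

   bound->start = gpu_addr & ~63ull;                 /* align down to 64B */
   bound->end = (gpu_addr + size + 63ull) & ~63ull;  /* align up to 64B */

   if (dirty->end == 0) {
      *dirty = *bound;                  /* first binding since the last flush */
   } else {
      if (bound->start < dirty->start)
         dirty->start = bound->start;
      if (bound->end > dirty->end)
         dirty->end = bound->end;
   }

   return dirty->end - dirty->start > (1ull << 32);
}

The real implementation additionally folds bound ranges into the dirty ranges only for the buffers actually used by each 3DPRIMITIVE, which this sketch leaves out.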
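
cmd_buffer_emit_hashing_mode() reprograms GT_MODE only when the requested hashing granularity changes and the render area is larger than the smallest hashing block of the target mode, so tiny operations do not pay for the pipeline stall. A condensed model of that decision, with the slice/subslice mode tables reduced to an index (assumed helper, not driver code):

/* idx 0 = coarse hashing (32x32 slice / 16x4 subslice), idx 1 = finest
 * (NORMAL / 8x4). Returns -1 when GT_MODE should be left alone. */
static int
choose_hashing_mode(unsigned width, unsigned height, unsigned scale,
                    unsigned current_scale)
{
   static const unsigned min_size[2][2] = { { 16, 4 }, { 8, 4 } };
   const unsigned idx = scale > 1 ? 1 : 0;

   if (current_scale == scale)
      return -1;                        /* already in the requested mode */
   if (width <= min_size[idx][0] && height <= min_size[idx][1])
      return -1;                        /* area too small to benefit */
   return (int)idx;
}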