X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fintel%2Fvulkan%2FgenX_cmd_buffer.c;h=86ef1663ac456983c423a3243c20c50930c0ad18;hb=cefb4341b77a00d17bfe1f39ebdaec56d0632bfa;hp=8287d67a53d1a8a6ae921cbe15b8c26c5af08ffe;hpb=94675edcfda93dab29f2ed8a1f218d6491dbb085;p=mesa.git diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 8287d67a53d..86ef1663ac4 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -27,20 +27,17 @@ #include "anv_private.h" #include "vk_format_info.h" #include "vk_util.h" +#include "util/fast_idiv_by_const.h" #include "common/gen_l3_config.h" #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" -static void -emit_lrm(struct anv_batch *batch, - uint32_t reg, struct anv_bo *bo, uint32_t offset) -{ - anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = reg; - lrm.MemoryAddress = (struct anv_address) { bo, offset }; - } -} +/* We reserve GPR 14 and 15 for conditional rendering */ +#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14 +#define __gen_get_batch_dwords anv_batch_emit_dwords +#define __gen_address_offset anv_address_add +#include "common/gen_mi_builder.h" static void emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm) @@ -51,22 +48,16 @@ emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm) } } -#if GEN_IS_HASWELL || GEN_GEN >= 8 -static void -emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src) -{ - anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { - lrr.SourceRegisterAddress = src; - lrr.DestinationRegisterAddress = dst; - } -} -#endif - void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) { struct anv_device *device = cmd_buffer->device; + /* If we are emitting a new state base address we probably need to re-emit + * binding tables. + */ + cmd_buffer->state.descriptors_dirty |= ~0; + /* Emit a render target cache flush. * * This isn't documented anywhere in the PRM. 
However, it seems to be @@ -82,26 +73,28 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) { sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.GeneralStateMemoryObjectControlState = GENX(MOCS); + sba.GeneralStateMOCS = GENX(MOCS); sba.GeneralStateBaseAddressModifyEnable = true; + sba.StatelessDataPortAccessMOCS = GENX(MOCS); + sba.SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer); - sba.SurfaceStateMemoryObjectControlState = GENX(MOCS); + sba.SurfaceStateMOCS = GENX(MOCS); sba.SurfaceStateBaseAddressModifyEnable = true; sba.DynamicStateBaseAddress = - (struct anv_address) { &device->dynamic_state_pool.block_pool.bo, 0 }; - sba.DynamicStateMemoryObjectControlState = GENX(MOCS); + (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 }; + sba.DynamicStateMOCS = GENX(MOCS); sba.DynamicStateBaseAddressModifyEnable = true; sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 }; - sba.IndirectObjectMemoryObjectControlState = GENX(MOCS); + sba.IndirectObjectMOCS = GENX(MOCS); sba.IndirectObjectBaseAddressModifyEnable = true; sba.InstructionBaseAddress = - (struct anv_address) { &device->instruction_state_pool.block_pool.bo, 0 }; - sba.InstructionMemoryObjectControlState = GENX(MOCS); + (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 }; + sba.InstructionMOCS = GENX(MOCS); sba.InstructionBaseAddressModifyEnable = true; # if (GEN_GEN >= 8) @@ -117,6 +110,43 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) sba.IndirectObjectBufferSizeModifyEnable = true; sba.InstructionBufferSize = 0xfffff; sba.InstructionBuffersizeModifyEnable = true; +# else + /* On gen7, we have upper bounds instead. According to the docs, + * setting an upper bound of zero means that no bounds checking is + * performed so, in theory, we should be able to leave them zero. + * However, border color is broken and the GPU bounds-checks anyway. + * To avoid this and other potential problems, we may as well set it + * for everything. 
+ */ + sba.GeneralStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.GeneralStateAccessUpperBoundModifyEnable = true; + sba.DynamicStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.DynamicStateAccessUpperBoundModifyEnable = true; + sba.InstructionAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.InstructionAccessUpperBoundModifyEnable = true; +# endif +# if (GEN_GEN >= 9) + if (cmd_buffer->device->instance->physicalDevice.use_softpin) { + sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { + .bo = device->surface_state_pool.block_pool.bo, + .offset = 0, + }; + sba.BindlessSurfaceStateSize = (1 << 20) - 1; + } else { + sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS; + sba.BindlessSurfaceStateSize = 0; + } + sba.BindlessSurfaceStateMOCS = GENX(MOCS); + sba.BindlessSurfaceStateBaseAddressModifyEnable = true; +# endif +# if (GEN_GEN >= 10) + sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.BindlessSamplerStateMOCS = GENX(MOCS); + sba.BindlessSamplerStateBaseAddressModifyEnable = true; + sba.BindlessSamplerStateBufferSize = 0; # endif } @@ -165,38 +195,45 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) } static void -add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer, - struct anv_state state, - struct anv_bo *bo, uint32_t offset) +add_surface_reloc(struct anv_cmd_buffer *cmd_buffer, + struct anv_state state, struct anv_address addr) { const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; VkResult result = anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, - state.offset + isl_dev->ss.addr_offset, bo, offset); + state.offset + isl_dev->ss.addr_offset, + addr.bo, addr.offset); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } static void -add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer, - const struct anv_image_view *image_view, - const uint32_t plane, - struct anv_surface_state state) +add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer, + struct anv_surface_state state) { const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; - const struct anv_image *image = image_view->image; - uint32_t image_plane = image_view->planes[plane].image_plane; - add_surface_state_reloc(cmd_buffer, state.state, - image->planes[image_plane].bo, state.address); + assert(!anv_address_is_null(state.address)); + add_surface_reloc(cmd_buffer, state.state, state.address); - if (state.aux_address) { + if (!anv_address_is_null(state.aux_address)) { VkResult result = anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, state.state.offset + isl_dev->ss.aux_addr_offset, - image->planes[image_plane].bo, state.aux_address); + state.aux_address.bo, state.aux_address.offset); + if (result != VK_SUCCESS) + anv_batch_set_error(&cmd_buffer->batch, result); + } + + if (!anv_address_is_null(state.clear_address)) { + VkResult result = + anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->pool->alloc, + state.state.offset + + isl_dev->ss.clear_color_state_offset, + state.clear_address.bo, state.clear_address.offset); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -209,7 +246,7 @@ color_attachment_compute_aux_usage(struct anv_device * device, union isl_color_value *fast_clear_color) { struct anv_attachment_state *att_state = &cmd_state->attachments[att]; - struct anv_image_view *iview = 
cmd_state->framebuffer->attachments[att]; + struct anv_image_view *iview = cmd_state->attachments[att].image_view; assert(iview->n_planes == 1); @@ -272,20 +309,8 @@ color_attachment_compute_aux_usage(struct anv_device * device, assert(iview->image->planes[0].aux_surface.isl.usage & (ISL_SURF_USAGE_CCS_BIT | ISL_SURF_USAGE_MCS_BIT)); - const struct isl_format_layout *view_fmtl = - isl_format_get_layout(iview->planes[0].isl.format); union isl_color_value clear_color = {}; - -#define COPY_CLEAR_COLOR_CHANNEL(c, i) \ - if (view_fmtl->channels.c.bits) \ - clear_color.u32[i] = att_state->clear_value.color.uint32[i] - - COPY_CLEAR_COLOR_CHANNEL(r, 0); - COPY_CLEAR_COLOR_CHANNEL(g, 1); - COPY_CLEAR_COLOR_CHANNEL(b, 2); - COPY_CLEAR_COLOR_CHANNEL(a, 3); - -#undef COPY_CLEAR_COLOR_CHANNEL + anv_clear_color_from_att_state(&clear_color, att_state, iview); att_state->clear_color_is_zero_one = isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format); @@ -363,7 +388,7 @@ depth_stencil_attachment_compute_aux_usage(struct anv_device *device, struct anv_render_pass_attachment *pass_att = &cmd_state->pass->attachments[att]; struct anv_attachment_state *att_state = &cmd_state->attachments[att]; - struct anv_image_view *iview = cmd_state->framebuffer->attachments[att]; + struct anv_image_view *iview = cmd_state->attachments[att].image_view; /* These will be initialized after the first subpass transition. */ att_state->aux_usage = ISL_AUX_USAGE_NONE; @@ -468,8 +493,58 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, 0, 0, 1, hiz_op); } -#define MI_PREDICATE_SRC0 0x2400 -#define MI_PREDICATE_SRC1 0x2408 +static inline bool +vk_image_layout_stencil_write_optimal(VkImageLayout layout) +{ + return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL; +} + +/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless + * the initial layout is undefined, the HiZ buffer and depth buffer will + * represent the same data at the end of this operation. + */ +static void +transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout) +{ +#if GEN_GEN == 7 + uint32_t plane = anv_image_aspect_to_plane(image->aspects, + VK_IMAGE_ASPECT_STENCIL_BIT); + + /* On gen7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in three + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the + * other two, we delay the copy until a transition into some other layout. 
+ */ + if (image->planes[plane].shadow_surface.isl.size_B > 0 && + vk_image_layout_stencil_write_optimal(initial_layout) && + !vk_image_layout_stencil_write_optimal(final_layout)) { + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + base_level, level_count, + base_layer, layer_count); + } +#endif /* GEN_GEN == 7 */ +} + +#define MI_PREDICATE_SRC0 0x2400 +#define MI_PREDICATE_SRC1 0x2408 +#define MI_PREDICATE_RESULT 0x2418 static void set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer, @@ -515,25 +590,6 @@ set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer, set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true); } -#if GEN_IS_HASWELL || GEN_GEN >= 8 -static inline uint32_t -mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2) -{ - struct GENX(MI_MATH_ALU_INSTRUCTION) instr = { - .ALUOpcode = opcode, - .Operand1 = operand1, - .Operand2 = operand2, - }; - - uint32_t dw; - GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr); - - return dw; -} -#endif - -#define CS_GPR(n) (0x2600 + (n) * 8) - /* This is only really practical on haswell and above because it requires * MI math in order to get it correct. */ @@ -546,15 +602,12 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, enum isl_aux_op resolve_op, enum anv_fast_clear_type fast_clear_supported) { - struct anv_address fast_clear_type_addr = - anv_image_get_fast_clear_type_addr(cmd_buffer->device, image, aspect); - - /* Name some registers */ - const int image_fc_reg = MI_ALU_REG0; - const int fc_imm_reg = MI_ALU_REG1; - const int pred_reg = MI_ALU_REG2; + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); - uint32_t *dw; + const struct gen_mi_value fast_clear_type = + gen_mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect)); if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) { /* In this case, we're doing a full resolve which means we want the @@ -565,17 +618,13 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, * if the first slice has been fast-cleared, it is also marked as * compressed. See also set_image_fast_clear_state. 
*/ - struct anv_address compression_state_addr = - anv_image_get_compression_state_addr(cmd_buffer->device, image, - aspect, level, array_layer); - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = MI_PREDICATE_SRC0; - lrm.MemoryAddress = compression_state_addr; - } - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { - sdi.Address = compression_state_addr; - sdi.ImmediateData = 0; - } + const struct gen_mi_value compression_state = + gen_mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device, + image, aspect, + level, array_layer)); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), + compression_state); + gen_mi_store(&b, compression_state, gen_mi_imm(0)); if (level == 0 && array_layer == 0) { /* If the predicate is true, we want to write 0 to the fast clear type @@ -583,25 +632,10 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, * * clear_type = clear_type & ~predicate; */ - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = CS_GPR(image_fc_reg); - lrm.MemoryAddress = fast_clear_type_addr; - } - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_REG), lrr) { - lrr.DestinationRegisterAddress = CS_GPR(pred_reg); - lrr.SourceRegisterAddress = MI_PREDICATE_SRC0; - } - - dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); - dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, image_fc_reg); - dw[2] = mi_alu(MI_ALU_LOADINV, MI_ALU_SRCB, pred_reg); - dw[3] = mi_alu(MI_ALU_AND, 0, 0); - dw[4] = mi_alu(MI_ALU_STORE, image_fc_reg, MI_ALU_ACCU); - - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.MemoryAddress = fast_clear_type_addr; - srm.RegisterAddress = CS_GPR(image_fc_reg); - } + struct gen_mi_value new_fast_clear_type = + gen_mi_iand(&b, fast_clear_type, + gen_mi_inot(&b, gen_mi_reg64(MI_PREDICATE_SRC0))); + gen_mi_store(&b, fast_clear_type, new_fast_clear_type); } } else if (level == 0 && array_layer == 0) { /* In this case, we are doing a partial resolve to get rid of fast-clear @@ -611,42 +645,20 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); assert(fast_clear_supported < ANV_FAST_CLEAR_ANY); - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = CS_GPR(image_fc_reg); - lrm.MemoryAddress = fast_clear_type_addr; - } - emit_lri(&cmd_buffer->batch, CS_GPR(image_fc_reg) + 4, 0); - - emit_lri(&cmd_buffer->batch, CS_GPR(fc_imm_reg), fast_clear_supported); - emit_lri(&cmd_buffer->batch, CS_GPR(fc_imm_reg) + 4, 0); - - /* We need to compute (fast_clear_supported < image->fast_clear). - * We do this by subtracting and storing the carry bit. - */ - dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); - dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, fc_imm_reg); - dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, image_fc_reg); - dw[3] = mi_alu(MI_ALU_SUB, 0, 0); - dw[4] = mi_alu(MI_ALU_STORE, pred_reg, MI_ALU_CF); - - /* Store the predicate */ - emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(pred_reg)); + /* We need to compute (fast_clear_supported < image->fast_clear) */ + struct gen_mi_value pred = + gen_mi_ult(&b, gen_mi_imm(fast_clear_supported), fast_clear_type); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), + gen_mi_value_ref(&b, pred)); /* If the predicate is true, we want to write 0 to the fast clear type * and, if it's false, leave it alone. 
We can do this by writing * * clear_type = clear_type & ~predicate; */ - dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); - dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, image_fc_reg); - dw[2] = mi_alu(MI_ALU_LOADINV, MI_ALU_SRCB, pred_reg); - dw[3] = mi_alu(MI_ALU_AND, 0, 0); - dw[4] = mi_alu(MI_ALU_STORE, image_fc_reg, MI_ALU_ACCU); - - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.RegisterAddress = CS_GPR(image_fc_reg); - srm.MemoryAddress = fast_clear_type_addr; - } + struct gen_mi_value new_fast_clear_type = + gen_mi_iand(&b, fast_clear_type, gen_mi_inot(&b, pred)); + gen_mi_store(&b, fast_clear_type, new_fast_clear_type); } else { /* In this case, we're trying to do a partial resolve on a slice that * doesn't have clear color. There's nothing to do. @@ -655,13 +667,8 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, return; } - /* We use the first half of src0 for the actual predicate. Set the second - * half of src0 and all of src1 to 0 as the predicate operation will be - * doing an implicit src0 != src1. - */ - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 , 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0); + /* Set src1 to 0 and use a != condition */ + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0)); anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOADINV; @@ -680,8 +687,12 @@ anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, enum isl_aux_op resolve_op, enum anv_fast_clear_type fast_clear_supported) { - struct anv_address fast_clear_type_addr = - anv_image_get_fast_clear_type_addr(cmd_buffer->device, image, aspect); + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + struct gen_mi_value fast_clear_type_mem = + gen_mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect)); /* This only works for partial resolves and only when the clear color is * all or nothing. On the upside, this emits less command streamer code @@ -698,22 +709,9 @@ anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, * can't sample from CCS surfaces. It's enough to just load the fast clear * state into the predicate register. */ - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = MI_PREDICATE_SRC0; - lrm.MemoryAddress = fast_clear_type_addr; - } - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { - sdi.Address = fast_clear_type_addr; - sdi.ImmediateData = 0; - } - - /* We use the first half of src0 for the actual predicate. Set the second - * half of src0 and all of src1 to 0 as the predicate operation will be - * doing an implicit src0 != src1. 
- */ - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 , 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0)); + gen_mi_store(&b, fast_clear_type_mem, gen_mi_imm(0)); anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOADINV; @@ -726,6 +724,7 @@ anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, static void anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + enum isl_format format, VkImageAspectFlagBits aspect, uint32_t level, uint32_t array_layer, enum isl_aux_op resolve_op, @@ -750,13 +749,14 @@ anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) resolve_op = ISL_AUX_OP_FULL_RESOLVE; - anv_image_ccs_op(cmd_buffer, image, aspect, level, - array_layer, 1, resolve_op, true); + anv_image_ccs_op(cmd_buffer, image, format, aspect, level, + array_layer, 1, resolve_op, NULL, true); } static void anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + enum isl_format format, VkImageAspectFlagBits aspect, uint32_t array_layer, enum isl_aux_op resolve_op, @@ -770,8 +770,8 @@ anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, aspect, 0, array_layer, resolve_op, fast_clear_supported); - anv_image_mcs_op(cmd_buffer, image, aspect, - array_layer, 1, resolve_op, true); + anv_image_mcs_op(cmd_buffer, image, format, aspect, + array_layer, 1, resolve_op, NULL, true); #else unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail"); #endif @@ -787,7 +787,7 @@ genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, uint32_t layer_count) { /* The aspect must be exactly one of the image aspects. */ - assert(_mesa_bitcount(aspect) == 1 && (aspect & image->aspects)); + assert(util_bitcount(aspect) == 1 && (aspect & image->aspects)); /* The only compression types with more than just fast-clears are MCS, * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually @@ -813,27 +813,21 @@ init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer, set_image_fast_clear_state(cmd_buffer, image, aspect, ANV_FAST_CLEAR_NONE); - /* The fast clear value dword(s) will be copied into a surface state object. - * Ensure that the restrictions of the fields in the dword(s) are followed. - * - * CCS buffers on SKL+ can have any value set for the clear colors. - */ - if (image->samples == 1 && GEN_GEN >= 9) - return; - - /* Other combinations of auxiliary buffers and platforms require specific - * values in the clear value dword(s). + /* Initialize the struct fields that are accessed for fast-clears so that + * the HW restrictions on the field values are satisfied. */ struct anv_address addr = anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); if (GEN_GEN >= 9) { - for (unsigned i = 0; i < 4; i++) { + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + const unsigned num_dwords = GEN_GEN >= 10 ? + isl_dev->ss.clear_color_state_size / 4 : + isl_dev->ss.clear_value_size / 4; + for (unsigned i = 0; i < num_dwords; i++) { anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { sdi.Address = addr; sdi.Address.offset += i * 4; - /* MCS buffers on SKL+ can only have 1/0 clear colors. 
*/ - assert(image->samples > 1); sdi.ImmediateData = 0; } } @@ -872,20 +866,44 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, assert(cmd_buffer && image); assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); - struct anv_bo *ss_bo = - &cmd_buffer->device->surface_state_pool.block_pool.bo; - uint32_t ss_clear_offset = surface_state.offset + - cmd_buffer->device->isl_dev.ss.clear_value_offset; + struct anv_address ss_clear_addr = { + .bo = cmd_buffer->device->surface_state_pool.block_pool.bo, + .offset = surface_state.offset + + cmd_buffer->device->isl_dev.ss.clear_value_offset, + }; const struct anv_address entry_addr = anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; +#if GEN_GEN == 7 + /* On gen7, the combination of commands used here(MI_LOAD_REGISTER_MEM + * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is + * in-flight when they are issued even if the memory touched is not + * currently active for rendering. The weird bit is that it is not the + * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight + * rendering hangs such that the next stalling command after the + * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang. + * + * It is unclear exactly why this hang occurs. Both MI commands come with + * warnings about the 3D pipeline but that doesn't seem to fully explain + * it. My (Jason's) best theory is that it has something to do with the + * fact that we're using a GPU state register as our temporary and that + * something with reading/writing it is causing problems. + * + * In order to work around this issue, we emit a PIPE_CONTROL with the + * command streamer stall bit set. + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +#endif + + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + if (copy_from_surface_state) { - genX(cmd_buffer_mi_memcpy)(cmd_buffer, entry_addr.bo, entry_addr.offset, - ss_bo, ss_clear_offset, copy_size); + gen_mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size); } else { - genX(cmd_buffer_mi_memcpy)(cmd_buffer, ss_bo, ss_clear_offset, - entry_addr.bo, entry_addr.offset, copy_size); + gen_mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size); /* Updating a surface state object may require that the state cache be * invalidated. From the SKL PRM, Shared Functions -> State -> State @@ -931,7 +949,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, assert(level_count != VK_REMAINING_MIP_LEVELS && layer_count != VK_REMAINING_ARRAY_LAYERS); /* Ensure the subresource range is valid. */ - uint64_t last_level_num = base_level + level_count; + UNUSED uint64_t last_level_num = base_level + level_count; const uint32_t max_depth = anv_minify(image->extent.depth, base_level); UNUSED const uint32_t image_layers = MAX2(image->array_size, max_depth); assert((uint64_t)base_layer + layer_count <= image_layers); @@ -948,7 +966,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); - if (image->planes[plane].shadow_surface.isl.size > 0 && + if (image->planes[plane].shadow_surface.isl.size_B > 0 && final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { /* This surface is a linear compressed image with a tiled shadow surface * for texturing. 
The client is about to use it in READ_ONLY_OPTIMAL so @@ -960,6 +978,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, assert(isl_format_is_compressed(image->planes[plane].surface.isl.format)); assert(plane == 0); anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_COLOR_BIT, base_level, level_count, base_layer, layer_count); } @@ -1025,9 +1044,10 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, uint32_t level_layer_count = MIN2(layer_count, aux_layers - base_layer); - anv_image_ccs_op(cmd_buffer, image, aspect, level, - base_layer, level_layer_count, - ISL_AUX_OP_AMBIGUATE, false); + anv_image_ccs_op(cmd_buffer, image, + image->planes[plane].surface.isl.format, + aspect, level, base_layer, level_layer_count, + ISL_AUX_OP_AMBIGUATE, NULL, false); if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { set_image_compressed_bit(cmd_buffer, image, aspect, @@ -1043,9 +1063,10 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, } assert(base_level == 0 && level_count == 1); - anv_image_mcs_op(cmd_buffer, image, aspect, - base_layer, layer_count, - ISL_AUX_OP_FAST_CLEAR, false); + anv_image_mcs_op(cmd_buffer, image, + image->planes[plane].surface.isl.format, + aspect, base_layer, layer_count, + ISL_AUX_OP_FAST_CLEAR, NULL, false); } return; } @@ -1121,12 +1142,22 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, for (uint32_t a = 0; a < level_layer_count; a++) { uint32_t array_layer = base_layer + a; if (image->samples == 1) { - anv_cmd_predicated_ccs_resolve(cmd_buffer, image, aspect, - level, array_layer, resolve_op, + anv_cmd_predicated_ccs_resolve(cmd_buffer, image, + image->planes[plane].surface.isl.format, + aspect, level, array_layer, resolve_op, final_fast_clear); } else { - anv_cmd_predicated_mcs_resolve(cmd_buffer, image, aspect, - array_layer, resolve_op, + /* We only support fast-clear on the first layer so partial + * resolves should not be used on other layers as they will use + * the clear color stored in memory that is only valid for layer0. 
+ */ + if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && + array_layer != 0) + continue; + + anv_cmd_predicated_mcs_resolve(cmd_buffer, image, + image->planes[plane].surface.isl.format, + aspect, array_layer, resolve_op, final_fast_clear); } } @@ -1146,6 +1177,7 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, { const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; struct anv_cmd_state *state = &cmd_buffer->state; + struct anv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; vk_free(&cmd_buffer->pool->alloc, state->attachments); @@ -1185,6 +1217,12 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, next_state.offset += ss_stride; next_state.map += ss_stride; + const VkRenderPassAttachmentBeginInfoKHR *begin_attachment = + vk_find_struct_const(begin, RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR); + + if (begin && !begin_attachment) + assert(pass->attachment_count == framebuffer->attachment_count); + for (uint32_t i = 0; i < pass->attachment_count; ++i) { if (vk_format_is_color(pass->attachments[i].format)) { state->attachments[i].color.state = next_state; @@ -1197,14 +1235,19 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, next_state.offset += ss_stride; next_state.map += ss_stride; } + + if (begin_attachment && begin_attachment->attachmentCount != 0) { + assert(begin_attachment->attachmentCount == pass->attachment_count); + ANV_FROM_HANDLE(anv_image_view, iview, begin_attachment->pAttachments[i]); + cmd_buffer->state.attachments[i].image_view = iview; + } else if (framebuffer && i < framebuffer->attachment_count) { + cmd_buffer->state.attachments[i].image_view = framebuffer->attachments[i]; + } } assert(next_state.offset == state->render_pass_states.offset + state->render_pass_states.alloc_size); if (begin) { - ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer); - assert(pass->attachment_count == framebuffer->attachment_count); - isl_null_fill_state(isl_dev, state->null_surface_state.map, isl_extent3d(framebuffer->width, framebuffer->height, @@ -1247,15 +1290,15 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, if (clear_aspects) state->attachments[i].clear_value = begin->pClearValues[i]; - struct anv_image_view *iview = framebuffer->attachments[i]; + struct anv_image_view *iview = cmd_buffer->state.attachments[i].image_view; anv_assert(iview->vk_format == att->format); - anv_assert(iview->n_planes == 1); const uint32_t num_layers = iview->planes[0].isl.array_len; state->attachments[i].pending_clear_views = (1 << num_layers) - 1; union isl_color_value clear_color = { .u32 = { 0, } }; if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + anv_assert(iview->n_planes == 1); assert(att_aspects == VK_IMAGE_ASPECT_COLOR_BIT); color_attachment_compute_aux_usage(cmd_buffer->device, state, i, begin->renderArea, @@ -1272,8 +1315,7 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, &state->attachments[i].color, NULL); - add_image_view_relocs(cmd_buffer, iview, 0, - state->attachments[i].color); + add_surface_state_relocs(cmd_buffer, state->attachments[i].color); } else { depth_stencil_attachment_compute_aux_usage(cmd_buffer->device, state, i, @@ -1292,8 +1334,7 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, &state->attachments[i].input, NULL); - add_image_view_relocs(cmd_buffer, iview, 0, - state->attachments[i].input); + add_surface_state_relocs(cmd_buffer, state->attachments[i].input); } } } @@ -1374,7 +1415,7 @@ 
genX(BeginCommandBuffer)( if (iview) { VkImageLayout layout = - cmd_buffer->state.subpass->depth_stencil_attachment.layout; + cmd_buffer->state.subpass->depth_stencil_attachment->layout; enum isl_aux_usage aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image, @@ -1387,6 +1428,19 @@ genX(BeginCommandBuffer)( cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; } +#if GEN_GEN >= 8 || GEN_IS_HASWELL + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info = + vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT); + + /* If secondary buffer supports conditional rendering + * we should emit commands as if conditional rendering is enabled. + */ + cmd_buffer->state.conditional_render_enabled = + conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable; + } +#endif + return result; } @@ -1421,10 +1475,18 @@ genX(BeginCommandBuffer)( * context restore, so the mentioned hang doesn't happen. However, * software must program push constant commands for all stages prior to * rendering anything. So we flag them dirty in BeginCommandBuffer. + * + * Finally, we also make sure to stall at pixel scoreboard to make sure the + * constants have been loaded into the EUs prior to disable the push constants + * so that it doesn't hang a previous 3DPRIMITIVE. */ static void emit_isp_disable(struct anv_cmd_buffer *cmd_buffer) { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.IndirectStatePointersDisable = true; pc.CommandStreamerStallEnable = true; @@ -1483,6 +1545,21 @@ genX(CmdExecuteCommands)( assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); assert(!anv_batch_has_error(&secondary->batch)); +#if GEN_GEN >= 8 || GEN_IS_HASWELL + if (secondary->state.conditional_render_enabled) { + if (!primary->state.conditional_render_enabled) { + /* Secondary buffer is constructed as if it will be executed + * with conditional rendering, we should satisfy this dependency + * regardless of conditional rendering being enabled in primary. + */ + struct gen_mi_builder b; + gen_mi_builder_init(&b, &primary->batch); + gen_mi_store(&b, gen_mi_reg64(ANV_PREDICATE_RESULT_REG), + gen_mi_imm(UINT64_MAX)); + } + } +#endif + if (secondary->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { /* If we're continuing a render pass from the primary, we need to @@ -1490,13 +1567,20 @@ genX(CmdExecuteCommands)( * we allocated for them in BeginCommandBuffer. 
*/ struct anv_bo *ss_bo = - &primary->device->surface_state_pool.block_pool.bo; + primary->device->surface_state_pool.block_pool.bo; struct anv_state src_state = primary->state.render_pass_states; struct anv_state dst_state = secondary->state.render_pass_states; assert(src_state.alloc_size == dst_state.alloc_size); - genX(cmd_buffer_so_memcpy)(primary, ss_bo, dst_state.offset, - ss_bo, src_state.offset, + genX(cmd_buffer_so_memcpy)(primary, + (struct anv_address) { + .bo = ss_bo, + .offset = dst_state.offset, + }, + (struct anv_address) { + .bo = ss_bo, + .offset = src_state.offset, + }, src_state.alloc_size); } @@ -1592,6 +1676,14 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, uint32_t l3cr; anv_pack_struct(&l3cr, GENX(L3CNTLREG), .SLMEnable = has_slm, +#if GEN_GEN == 11 + /* WA_1406697149: Bit 9 "Error Detection Behavior Control" must be set + * in L3CNTLREG register. The default setting of the bit is not the + * desirable behavior. + */ + .ErrorDetectionBehaviorControl = true, + .UseFullWays = true, +#endif .URBAllocation = cfg->n[GEN_L3P_URB], .ROAllocation = cfg->n[GEN_L3P_RO], .DCAllocation = cfg->n[GEN_L3P_DC], @@ -1622,7 +1714,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]); /* Minimum number of ways that can be allocated to the URB. */ - MAYBE_UNUSED const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0; + const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0; assert(cfg->n[GEN_L3P_URB] >= n0_urb); uint32_t l3sqcr1, l3cr2, l3cr3; @@ -1733,10 +1825,29 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) pipe.StallAtPixelScoreboard = true; } + /* If a render target flush was emitted, then we can toggle off the bit + * saying that render target writes are ongoing. + */ + if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) + bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES); + bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT); } if (bits & ANV_PIPE_INVALIDATE_BITS) { + /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL", + * + * "If the VF Cache Invalidation Enable is set to a 1 in a + * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to + * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent + * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to + * a 1." + * + * This appears to hang Broadwell, so we restrict it to just gen9. + */ + if (GEN_GEN == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe); + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.StateCacheInvalidationEnable = bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT; @@ -1748,6 +1859,18 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; pipe.InstructionCacheInvalidateEnable = bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT; + + /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL", + * + * "When VF Cache Invalidate is set “Post Sync Operation” must be + * enabled to “Write Immediate Data” or “Write PS Depth Count” or + * “Write Timestamp”. 
+ */ + if (GEN_GEN == 9 && pipe.VFCacheInvalidationEnable) { + pipe.PostSyncOperation = WriteImmediateData; + pipe.Address = + (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; + } } bits &= ~ANV_PIPE_INVALIDATE_BITS; @@ -1794,24 +1917,34 @@ void genX(CmdPipelineBarrier)( const VkImageSubresourceRange *range = &pImageMemoryBarriers[i].subresourceRange; + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->extent.depth, range->baseMipLevel); + } else { + base_layer = range->baseArrayLayer; + layer_count = anv_get_layerCount(image, range); + } + if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, pImageMemoryBarriers[i].oldLayout, pImageMemoryBarriers[i].newLayout); - } else if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + } + + if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + range->baseMipLevel, + anv_get_levelCount(image, range), + base_layer, layer_count, + pImageMemoryBarriers[i].oldLayout, + pImageMemoryBarriers[i].newLayout); + } + + if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { VkImageAspectFlags color_aspects = anv_image_expand_aspects(image, range->aspectMask); uint32_t aspect_bit; - - uint32_t base_layer, layer_count; - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(image->extent.depth, range->baseMipLevel); - } else { - base_layer = range->baseArrayLayer; - layer_count = anv_get_layerCount(image, range); - } - anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) { transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit, range->baseMipLevel, @@ -1853,7 +1986,7 @@ cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) #endif const unsigned num_stages = - _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS); + util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS); unsigned size_per_stage = push_constant_kb / num_stages; /* Broadwell+ and Haswell gt3 require that the push constant sizes be in @@ -1923,6 +2056,31 @@ dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state, return pipe_state->dynamic_offsets[dynamic_offset_idx]; } +static struct anv_address +anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer, + struct anv_descriptor_set *set) +{ + if (set->pool) { + /* This is a normal descriptor set */ + return (struct anv_address) { + .bo = &set->pool->bo, + .offset = set->desc_mem.offset, + }; + } else { + /* This is a push descriptor set. We have to flag it as used on the GPU + * so that the next time we push descriptors, we grab a new memory. 
+ */ + struct anv_push_descriptor_set *push_set = + (struct anv_push_descriptor_set *)set; + push_set->set_used_on_gpu = true; + + return (struct anv_address) { + .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo, + .offset = set->desc_mem.offset, + }; + } +} + static VkResult emit_binding_table(struct anv_cmd_buffer *cmd_buffer, gl_shader_stage stage, @@ -1931,16 +2089,14 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, struct anv_subpass *subpass = cmd_buffer->state.subpass; struct anv_cmd_pipeline_state *pipe_state; struct anv_pipeline *pipeline; - uint32_t bias, state_offset; + uint32_t state_offset; switch (stage) { case MESA_SHADER_COMPUTE: pipe_state = &cmd_buffer->state.compute.base; - bias = 1; break; default: pipe_state = &cmd_buffer->state.gfx.base; - bias = 0; break; } pipeline = pipe_state->pipeline; @@ -1951,50 +2107,25 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, } struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map; - if (bias + map->surface_count == 0) { + if (map->surface_count == 0) { *bt_state = (struct anv_state) { 0, }; return VK_SUCCESS; } *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, - bias + map->surface_count, + map->surface_count, &state_offset); uint32_t *bt_map = bt_state->map; if (bt_state->map == NULL) return VK_ERROR_OUT_OF_DEVICE_MEMORY; - if (stage == MESA_SHADER_COMPUTE && - get_cs_prog_data(pipeline)->uses_num_work_groups) { - struct anv_bo *bo = cmd_buffer->state.compute.num_workgroups.bo; - uint32_t bo_offset = cmd_buffer->state.compute.num_workgroups.offset; - - struct anv_state surface_state; - surface_state = - anv_cmd_buffer_alloc_surface_state(cmd_buffer); - - const enum isl_format format = - anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, - format, bo_offset, 12, 1); - - bt_map[0] = surface_state.offset + state_offset; - add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset); - } - - if (map->surface_count == 0) - goto out; - - if (map->image_count > 0) { - VkResult result = - anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images); - if (result != VK_SUCCESS) - return result; - - cmd_buffer->state.push_constants_dirty |= 1 << stage; - } + /* We only need to emit relocs if we're not using softpin. If we are using + * softpin then we always keep all user-allocated memory objects resident. 
+ */ + const bool need_client_mem_relocs = + !cmd_buffer->device->instance->physicalDevice.use_softpin; - uint32_t image = 0; for (uint32_t s = 0; s < map->surface_count; s++) { struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; @@ -2023,7 +2154,60 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, surface_state = cmd_buffer->state.null_surface_state; } - bt_map[bias + s] = surface_state.offset + state_offset; + bt_map[s] = surface_state.offset + state_offset; + continue; + } else if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); + + struct anv_address constant_data = { + .bo = pipeline->device->dynamic_state_pool.block_pool.bo, + .offset = pipeline->shaders[stage]->constant_data.offset, + }; + unsigned constant_data_size = + pipeline->shaders[stage]->constant_data_size; + + const enum isl_format format = + anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + anv_fill_buffer_surface_state(cmd_buffer->device, + surface_state, format, + constant_data, constant_data_size, 1); + + bt_map[s] = surface_state.offset + state_offset; + add_surface_reloc(cmd_buffer, surface_state, constant_data); + continue; + } else if (binding->set == ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS) { + /* This is always the first binding for compute shaders */ + assert(stage == MESA_SHADER_COMPUTE && s == 0); + if (!get_cs_prog_data(pipeline)->uses_num_work_groups) + continue; + + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); + + const enum isl_format format = + anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, + cmd_buffer->state.compute.num_workgroups, + 12, 1); + bt_map[s] = surface_state.offset + state_offset; + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + cmd_buffer->state.compute.num_workgroups); + } + continue; + } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { + /* This is a descriptor set buffer so the set index is actually + * given by binding->binding. (Yes, that's confusing.) 
+ */ + struct anv_descriptor_set *set = + pipe_state->descriptors[binding->binding]; + assert(set->desc_mem.alloc_size); + assert(set->desc_surface_state.alloc_size); + bt_map[s] = set->desc_surface_state.offset + state_offset; + add_surface_reloc(cmd_buffer, set->desc_surface_state, + anv_descriptor_set_address(cmd_buffer, set)); continue; } @@ -2043,8 +2227,8 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, desc->image_view->planes[binding->plane].optimal_sampler_surface_state; surface_state = sstate.state; assert(surface_state.alloc_size); - add_image_view_relocs(cmd_buffer, desc->image_view, - binding->plane, sstate); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); break; } case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: @@ -2059,8 +2243,8 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, desc->image_view->planes[binding->plane].optimal_sampler_surface_state; surface_state = sstate.state; assert(surface_state.alloc_size); - add_image_view_relocs(cmd_buffer, desc->image_view, - binding->plane, sstate); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); } else { /* For color input attachments, we create the surface state at * vkBeginRenderPass time so that we can include aux and clear @@ -2079,14 +2263,8 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, : desc->image_view->planes[binding->plane].storage_surface_state; surface_state = sstate.state; assert(surface_state.alloc_size); - add_image_view_relocs(cmd_buffer, desc->image_view, - binding->plane, sstate); - - struct brw_image_param *image_param = - &cmd_buffer->state.push_constants[stage]->images[image++]; - - *image_param = desc->image_view->planes[binding->plane].storage_image_param; - image_param->surface_idx = bias + s; + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); break; } @@ -2095,9 +2273,10 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: surface_state = desc->buffer_view->surface_state; assert(surface_state.alloc_size); - add_surface_state_reloc(cmd_buffer, surface_state, - desc->buffer_view->bo, - desc->buffer_view->offset); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: @@ -2111,16 +2290,18 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, /* Clamp the range to the buffer size */ uint32_t range = MIN2(desc->range, desc->buffer->size - offset); + struct anv_address address = + anv_address_add(desc->buffer->address, offset); + surface_state = anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); enum isl_format format = anv_isl_format_for_descriptor_type(desc->type); anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, - format, offset, range, 1); - add_surface_state_reloc(cmd_buffer, surface_state, - desc->buffer->bo, - desc->buffer->offset + offset); + format, address, range, 1); + if (need_client_mem_relocs) + add_surface_reloc(cmd_buffer, surface_state, address); break; } @@ -2129,15 +2310,10 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, ? 
desc->buffer_view->writeonly_storage_surface_state : desc->buffer_view->storage_surface_state; assert(surface_state.alloc_size); - add_surface_state_reloc(cmd_buffer, surface_state, - desc->buffer_view->bo, - desc->buffer_view->offset); - - struct brw_image_param *image_param = - &cmd_buffer->state.push_constants[stage]->images[image++]; - - *image_param = desc->buffer_view->storage_image_param; - image_param->surface_idx = bias + s; + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } break; default: @@ -2145,12 +2321,8 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, continue; } - bt_map[bias + s] = surface_state.offset + state_offset; + bt_map[s] = surface_state.offset + state_offset; } - assert(image == map->image_count); - - out: - anv_state_flush(cmd_buffer->device, *bt_state); #if GEN_GEN >= 11 /* The PIPE_CONTROL command description says: @@ -2223,8 +2395,6 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer, sampler->state[binding->plane], sizeof(sampler->state[0])); } - anv_state_flush(cmd_buffer->device, *state); - return VK_SUCCESS; } @@ -2382,36 +2552,59 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, const struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[surface]; - const struct anv_descriptor *desc = - anv_descriptor_for_binding(&gfx_state->base, binding); - struct anv_address read_addr; uint32_t read_len; - if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { - read_len = MIN2(range->length, - DIV_ROUND_UP(desc->buffer_view->range, 32) - range->start); - read_addr = (struct anv_address) { - .bo = desc->buffer_view->bo, - .offset = desc->buffer_view->offset + - range->start * 32, + if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { + struct anv_address constant_data = { + .bo = pipeline->device->dynamic_state_pool.block_pool.bo, + .offset = pipeline->shaders[stage]->constant_data.offset, }; - } else { - assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + unsigned constant_data_size = + pipeline->shaders[stage]->constant_data_size; - uint32_t dynamic_offset = - dynamic_offset_for_binding(&gfx_state->base, binding); - uint32_t buf_offset = - MIN2(desc->offset + dynamic_offset, desc->buffer->size); - uint32_t buf_range = - MIN2(desc->range, desc->buffer->size - buf_offset); + read_len = MIN2(range->length, + DIV_ROUND_UP(constant_data_size, 32) - range->start); + read_addr = anv_address_add(constant_data, + range->start * 32); + } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { + /* This is a descriptor set buffer so the set index is + * actually given by binding->binding. (Yes, that's + * confusing.) 
+ */ + struct anv_descriptor_set *set = + gfx_state->base.descriptors[binding->binding]; + struct anv_address desc_buffer_addr = + anv_descriptor_set_address(cmd_buffer, set); + const unsigned desc_buffer_size = set->desc_mem.alloc_size; read_len = MIN2(range->length, - DIV_ROUND_UP(buf_range, 32) - range->start); - read_addr = (struct anv_address) { - .bo = desc->buffer->bo, - .offset = desc->buffer->offset + buf_offset + - range->start * 32, - }; + DIV_ROUND_UP(desc_buffer_size, 32) - range->start); + read_addr = anv_address_add(desc_buffer_addr, + range->start * 32); + } else { + const struct anv_descriptor *desc = + anv_descriptor_for_binding(&gfx_state->base, binding); + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + read_len = MIN2(range->length, + DIV_ROUND_UP(desc->buffer_view->range, 32) - range->start); + read_addr = anv_address_add(desc->buffer_view->address, + range->start * 32); + } else { + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + + uint32_t dynamic_offset = + dynamic_offset_for_binding(&gfx_state->base, binding); + uint32_t buf_offset = + MIN2(desc->offset + dynamic_offset, desc->buffer->size); + uint32_t buf_range = + MIN2(desc->range, desc->buffer->size - buf_offset); + + read_len = MIN2(range->length, + DIV_ROUND_UP(buf_range, 32) - range->start); + read_addr = anv_address_add(desc->buffer->address, + buf_offset + range->start * 32); + } } if (read_len > 0) { @@ -2426,7 +2619,7 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, if (state.alloc_size > 0) { c.ConstantBody.Buffer[n] = (struct anv_address) { - .bo = &cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, .offset = state.offset, }; c.ConstantBody.ReadLength[n] = @@ -2463,6 +2656,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) uint32_t *p; uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) + vb_emit |= pipeline->vb_used; assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); @@ -2484,27 +2679,20 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) struct GENX(VERTEX_BUFFER_STATE) state = { .VertexBufferIndex = vb, -#if GEN_GEN >= 8 - .MemoryObjectControlState = GENX(MOCS), -#else - .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA, - /* Our implementation of VK_KHR_multiview uses instancing to draw - * the different views. If the client asks for instancing, we - * need to use the Instance Data Step Rate to ensure that we - * repeat the client's per-instance data once for each view. - */ - .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass), - .VertexBufferMemoryObjectControlState = GENX(MOCS), + .MOCS = anv_mocs_for_bo(cmd_buffer->device, buffer->address.bo), +#if GEN_GEN <= 7 + .BufferAccessType = pipeline->vb[vb].instanced ? 
INSTANCEDATA : VERTEXDATA, + .InstanceDataStepRate = pipeline->vb[vb].instance_divisor, #endif .AddressModifyEnable = true, - .BufferPitch = pipeline->binding_stride[vb], - .BufferStartingAddress = { buffer->bo, buffer->offset + offset }, + .BufferPitch = pipeline->vb[vb].stride, + .BufferStartingAddress = anv_address_add(buffer->address, offset), #if GEN_GEN >= 8 .BufferSize = buffer->size - offset #else - .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1}, + .EndAddress = anv_address_add(buffer->address, buffer->size - 1), #endif }; @@ -2515,6 +2703,34 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.gfx.vb_dirty &= ~vb_emit; +#if GEN_GEN >= 8 + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) { + /* We don't need any per-buffer dirty tracking because you're not + * allowed to bind different XFB buffers while XFB is enabled. + */ + for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { + struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx]; + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { + sob.SOBufferIndex = idx; + + if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { + sob.SOBufferEnable = true; + sob.MOCS = cmd_buffer->device->default_mocs, + sob.StreamOffsetWriteEnable = false; + sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address, + xfb->offset); + /* Size is in DWords - 1 */ + sob.SurfaceSize = xfb->size / 4 - 1; + } + } + } + + /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */ + if (GEN_GEN >= 10) + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + } +#endif + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); @@ -2587,7 +2803,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) pipeline->depth_clamp_enable); } - if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR) + if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR | + ANV_CMD_DIRTY_RENDER_TARGETS)) gen7_cmd_buffer_emit_scissor(cmd_buffer); genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); @@ -2597,7 +2814,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) static void emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, - struct anv_bo *bo, uint32_t offset, + struct anv_address addr, uint32_t size, uint32_t index) { uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, @@ -2608,23 +2825,22 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, .VertexBufferIndex = index, .AddressModifyEnable = true, .BufferPitch = 0, + .MOCS = anv_mocs_for_bo(cmd_buffer->device, addr.bo), #if (GEN_GEN >= 8) - .MemoryObjectControlState = GENX(MOCS), - .BufferStartingAddress = { bo, offset }, + .BufferStartingAddress = addr, .BufferSize = size #else - .VertexBufferMemoryObjectControlState = GENX(MOCS), - .BufferStartingAddress = { bo, offset }, - .EndAddress = { bo, offset + size }, + .BufferStartingAddress = addr, + .EndAddress = anv_address_add(addr, size), #endif }); } static void emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, - struct anv_bo *bo, uint32_t offset) + struct anv_address addr) { - emit_vertex_bo(cmd_buffer, bo, offset, 8, ANV_SVGS_VB_INDEX); + emit_vertex_bo(cmd_buffer, addr, 8, ANV_SVGS_VB_INDEX); } static void @@ -2637,10 +2853,12 @@ emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, ((uint32_t *)id_state.map)[0] = base_vertex; ((uint32_t *)id_state.map)[1] = base_instance; - anv_state_flush(cmd_buffer->device, id_state); + struct 
    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
 
@@ -2587,7 +2803,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
                                       pipeline->depth_clamp_enable);
    }
 
-   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
+   if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR |
+                                      ANV_CMD_DIRTY_RENDER_TARGETS))
       gen7_cmd_buffer_emit_scissor(cmd_buffer);
 
    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
@@ -2597,7 +2814,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 
 static void
 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
-               struct anv_bo *bo, uint32_t offset,
+               struct anv_address addr,
                uint32_t size, uint32_t index)
 {
    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
@@ -2608,23 +2825,22 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
          .VertexBufferIndex = index,
          .AddressModifyEnable = true,
          .BufferPitch = 0,
+         .MOCS = anv_mocs_for_bo(cmd_buffer->device, addr.bo),
 #if (GEN_GEN >= 8)
-         .MemoryObjectControlState = GENX(MOCS),
-         .BufferStartingAddress = { bo, offset },
+         .BufferStartingAddress = addr,
          .BufferSize = size
 #else
-         .VertexBufferMemoryObjectControlState = GENX(MOCS),
-         .BufferStartingAddress = { bo, offset },
-         .EndAddress = { bo, offset + size },
+         .BufferStartingAddress = addr,
+         .EndAddress = anv_address_add(addr, size),
 #endif
       });
 }
 
 static void
 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
-                             struct anv_bo *bo, uint32_t offset)
+                             struct anv_address addr)
 {
-   emit_vertex_bo(cmd_buffer, bo, offset, 8, ANV_SVGS_VB_INDEX);
+   emit_vertex_bo(cmd_buffer, addr, 8, ANV_SVGS_VB_INDEX);
 }
 
 static void
@@ -2637,10 +2853,12 @@ emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
 
    ((uint32_t *)id_state.map)[0] = base_vertex;
    ((uint32_t *)id_state.map)[1] = base_instance;
 
-   anv_state_flush(cmd_buffer->device, id_state);
+   struct anv_address addr = {
+      .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+      .offset = id_state.offset,
+   };
 
-   emit_base_vertex_instance_bo(cmd_buffer,
-      &cmd_buffer->device->dynamic_state_pool.block_pool.bo, id_state.offset);
+   emit_base_vertex_instance_bo(cmd_buffer, addr);
 }
 
 static void
@@ -2651,11 +2869,12 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
 
    ((uint32_t *)state.map)[0] = draw_index;
 
-   anv_state_flush(cmd_buffer->device, state);
+   struct anv_address addr = {
+      .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+      .offset = state.offset,
+   };
 
-   emit_vertex_bo(cmd_buffer,
-                  &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
-                  state.offset, 4, ANV_DRAWID_VB_INDEX);
+   emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
 }
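// A sketch (illustrative struct names, not from the mesa tree) of the two
// small side-band vertex buffers the helpers above upload: 8 bytes of base
// vertex/instance data and a 4-byte per-draw index, consumed by shaders
// that use first-vertex, base-instance, or draw-id values:

#include <stdint.h>

struct svgs_data {          /* 8 bytes, bound at ANV_SVGS_VB_INDEX, pitch 0 */
   uint32_t base_vertex;    /* dword 0: firstVertex / vertexOffset */
   uint32_t base_instance;  /* dword 1: firstInstance */
};

struct draw_id_data {       /* 4 bytes, bound at ANV_DRAWID_VB_INDEX */
   uint32_t draw_index;     /* the i-th draw in a multi-draw loop */
};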
 
 void genX(CmdDraw)(
@@ -2674,7 +2893,11 @@ void genX(CmdDraw)(
 
    genX(cmd_buffer_flush_state)(cmd_buffer);
 
-   if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+   if (vs_prog_data->uses_firstvertex ||
+       vs_prog_data->uses_baseinstance)
       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
    if (vs_prog_data->uses_drawid)
       emit_draw_index(cmd_buffer, 0);
@@ -2685,6 +2908,7 @@ void genX(CmdDraw)(
    instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
 
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
       prim.VertexAccessType = SEQUENTIAL;
       prim.PrimitiveTopologyType = pipeline->topology;
       prim.VertexCountPerInstance = vertexCount;
@@ -2712,7 +2936,11 @@ void genX(CmdDrawIndexed)(
 
    genX(cmd_buffer_flush_state)(cmd_buffer);
 
-   if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+   if (vs_prog_data->uses_firstvertex ||
+       vs_prog_data->uses_baseinstance)
       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
    if (vs_prog_data->uses_drawid)
       emit_draw_index(cmd_buffer, 0);
@@ -2723,6 +2951,7 @@ void genX(CmdDrawIndexed)(
    instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
 
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
       prim.VertexAccessType = RANDOM;
       prim.PrimitiveTopologyType = pipeline->topology;
       prim.VertexCountPerInstance = indexCount;
@@ -2741,92 +2970,101 @@ void genX(CmdDrawIndexed)(
 #define GEN7_3DPRIM_START_INSTANCE      0x243C
 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
 
-/* MI_MATH only exists on Haswell+ */
-#if GEN_IS_HASWELL || GEN_GEN >= 8
-
-/* Emit dwords to multiply GPR0 by N */
-static void
-build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
+void genX(CmdDrawIndirectByteCountEXT)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstInstance,
+    VkBuffer                                    counterBuffer,
+    VkDeviceSize                                counterBufferOffset,
+    uint32_t                                    counterOffset,
+    uint32_t                                    vertexStride)
 {
-   VK_OUTARRAY_MAKE(out, dw, dw_count);
-
-#define append_alu(opcode, operand1, operand2) \
-   vk_outarray_append(&out, alu_dw) *alu_dw = mi_alu(opcode, operand1, operand2)
-
-   assert(N > 0);
-   unsigned top_bit = 31 - __builtin_clz(N);
-   for (int i = top_bit - 1; i >= 0; i--) {
-      /* We get our initial data in GPR0 and we write the final data out to
-       * GPR0 but we use GPR1 as our scratch register.
-       */
-      unsigned src_reg = i == top_bit - 1 ? MI_ALU_REG0 : MI_ALU_REG1;
-      unsigned dst_reg = i == 0 ? MI_ALU_REG0 : MI_ALU_REG1;
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline;
+   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 
-      /* Shift the current value left by 1 */
-      append_alu(MI_ALU_LOAD, MI_ALU_SRCA, src_reg);
-      append_alu(MI_ALU_LOAD, MI_ALU_SRCB, src_reg);
-      append_alu(MI_ALU_ADD, 0, 0);
+   /* firstVertex is always zero for this draw function */
+   const uint32_t firstVertex = 0;
 
-      if (N & (1 << i)) {
-         /* Store ACCU to R1 and add R0 to R1 */
-         append_alu(MI_ALU_STORE, MI_ALU_REG1, MI_ALU_ACCU);
-         append_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
-         append_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
-         append_alu(MI_ALU_ADD, 0, 0);
-      }
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
 
-      append_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
-   }
+   genX(cmd_buffer_flush_state)(cmd_buffer);
 
-#undef append_alu
-}
+   if (vs_prog_data->uses_firstvertex ||
+       vs_prog_data->uses_baseinstance)
+      emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
+   if (vs_prog_data->uses_drawid)
+      emit_draw_index(cmd_buffer, 0);
 
-static void
-emit_mul_gpr0(struct anv_batch *batch, uint32_t N)
-{
-   uint32_t num_dwords;
-   build_alu_multiply_gpr0(NULL, &num_dwords, N);
+   /* Our implementation of VK_KHR_multiview uses instancing to draw the
+    * different views.  We need to multiply instanceCount by the view count.
+    */
+   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
 
-   uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH));
-   build_alu_multiply_gpr0(dw + 1, &num_dwords, N);
-}
+   struct gen_mi_builder b;
+   gen_mi_builder_init(&b, &cmd_buffer->batch);
+   struct gen_mi_value count =
+      gen_mi_mem32(anv_address_add(counter_buffer->address,
+                                   counterBufferOffset));
+   if (counterOffset)
+      count = gen_mi_isub(&b, count, gen_mi_imm(counterOffset));
+   count = gen_mi_udiv32_imm(&b, count, vertexStride);
+   gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_VERTEX_COUNT), count);
+
+   gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_VERTEX),
+                gen_mi_imm(firstVertex));
+   gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_INSTANCE_COUNT),
+                gen_mi_imm(instanceCount));
+   gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_INSTANCE),
+                gen_mi_imm(firstInstance));
+   gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_BASE_VERTEX), gen_mi_imm(0));
 
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.IndirectParameterEnable = true;
+      prim.VertexAccessType = SEQUENTIAL;
+      prim.PrimitiveTopologyType = pipeline->topology;
+   }
 #endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
+}
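// A minimal CPU-side sketch (not from the mesa tree) of the vertex-count
// computation the MI builder performs above; the helper name is
// illustrative:

#include <stdint.h>
#include <assert.h>

static uint32_t
xfb_vertex_count(uint32_t counter, uint32_t counter_offset, uint32_t stride)
{
   /* The counter buffer holds a byte count written by a previous transform
    * feedback; subtract the starting offset and divide by the vertex stride.
    */
   return (counter - counter_offset) / stride;
}

int main(void)
{
   /* 4 vertices of 16 bytes each were written starting at byte 64. */
   assert(xfb_vertex_count(64 + 4 * 16, 64, 16) == 4);
   return 0;
}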
 
 static void
 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
-                         struct anv_buffer *buffer, uint64_t offset,
+                         struct anv_address addr,
                          bool indexed)
 {
-   struct anv_batch *batch = &cmd_buffer->batch;
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
+   struct gen_mi_builder b;
+   gen_mi_builder_init(&b, &cmd_buffer->batch);
 
-   emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+   gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_VERTEX_COUNT),
                gen_mi_mem32(anv_address_add(addr, 0)));
 
+   struct gen_mi_value instance_count = gen_mi_mem32(anv_address_add(addr, 4));
    unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
    if (view_count > 1) {
 #if GEN_IS_HASWELL || GEN_GEN >= 8
-      emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
-      emit_mul_gpr0(batch, view_count);
-      emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
+      instance_count = gen_mi_imul_imm(&b, instance_count, view_count);
 #else
       anv_finishme("Multiview + indirect draw requires MI_MATH; "
                    "MI_MATH is not supported on Ivy Bridge");
-      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
 #endif
-   } else {
-      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
    }
+   gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_INSTANCE_COUNT), instance_count);
 
-   emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+   gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_VERTEX),
                gen_mi_mem32(anv_address_add(addr, 8)));
 
    if (indexed) {
-      emit_lrm(batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
-      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+      gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_BASE_VERTEX),
                   gen_mi_mem32(anv_address_add(addr, 12)));
+      gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_INSTANCE),
                   gen_mi_mem32(anv_address_add(addr, 16)));
    } else {
-      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
-      emit_lri(batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+      gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_START_INSTANCE),
                   gen_mi_mem32(anv_address_add(addr, 12)));
+      gen_mi_store(&b, gen_mi_reg32(GEN7_3DPRIM_BASE_VERTEX), gen_mi_imm(0));
    }
 }
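// For reference (not from the mesa tree): the indirect-draw argument
// layouts defined by the Vulkan spec, which the loads above read from.
// The field offsets match the anv_address_add() offsets (0, 4, 8, 12, 16);
// the trailing underscores just avoid clashing with vulkan.h:

#include <stdint.h>

typedef struct VkDrawIndirectCommand_ {
   uint32_t vertexCount;    /* offset  0 -> 3DPRIM_VERTEX_COUNT */
   uint32_t instanceCount;  /* offset  4 -> 3DPRIM_INSTANCE_COUNT */
   uint32_t firstVertex;    /* offset  8 -> 3DPRIM_START_VERTEX */
   uint32_t firstInstance;  /* offset 12 -> 3DPRIM_START_INSTANCE */
} VkDrawIndirectCommand_;

typedef struct VkDrawIndexedIndirectCommand_ {
   uint32_t indexCount;     /* offset  0 -> 3DPRIM_VERTEX_COUNT */
   uint32_t instanceCount;  /* offset  4 -> 3DPRIM_INSTANCE_COUNT */
   uint32_t firstIndex;     /* offset  8 -> 3DPRIM_START_VERTEX */
   int32_t  vertexOffset;   /* offset 12 -> 3DPRIM_BASE_VERTEX */
   uint32_t firstInstance;  /* offset 16 -> 3DPRIM_START_INSTANCE */
} VkDrawIndexedIndirectCommand_;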
 
@@ -2847,19 +3085,23 @@ void genX(CmdDrawIndirect)(
 
    genX(cmd_buffer_flush_state)(cmd_buffer);
 
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
    for (uint32_t i = 0; i < drawCount; i++) {
-      struct anv_bo *bo = buffer->bo;
-      uint32_t bo_offset = buffer->offset + offset;
+      struct anv_address draw = anv_address_add(buffer->address, offset);
 
-      if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
-         emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
+      if (vs_prog_data->uses_firstvertex ||
+          vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
       if (vs_prog_data->uses_drawid)
          emit_draw_index(cmd_buffer, i);
 
-      load_indirect_parameters(cmd_buffer, buffer, offset, false);
+      load_indirect_parameters(cmd_buffer, draw, false);
 
       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.IndirectParameterEnable = true;
+         prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType = SEQUENTIAL;
         prim.PrimitiveTopologyType = pipeline->topology;
       }
@@ -2885,20 +3127,24 @@ void genX(CmdDrawIndexedIndirect)(
 
    genX(cmd_buffer_flush_state)(cmd_buffer);
 
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
    for (uint32_t i = 0; i < drawCount; i++) {
-      struct anv_bo *bo = buffer->bo;
-      uint32_t bo_offset = buffer->offset + offset;
+      struct anv_address draw = anv_address_add(buffer->address, offset);
 
       /* TODO: We need to stomp base vertex to 0 somehow */
-      if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
-         emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
+      if (vs_prog_data->uses_firstvertex ||
+          vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
       if (vs_prog_data->uses_drawid)
         emit_draw_index(cmd_buffer, i);
 
-      load_indirect_parameters(cmd_buffer, buffer, offset, true);
+      load_indirect_parameters(cmd_buffer, draw, true);
 
       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.IndirectParameterEnable = true;
+         prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType = RANDOM;
         prim.PrimitiveTopologyType = pipeline->topology;
       }
@@ -2907,12 +3153,322 @@ void genX(CmdDrawIndexedIndirect)(
    }
 }
 
-static VkResult
-flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
+#define TMP_DRAW_COUNT_REG 0x2670 /* MI_ALU_REG14 */
+
+static void
+prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
+                                 struct anv_address count_address,
+                                 const bool conditional_render_enabled)
 {
-   struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline;
-   struct anv_state surfaces = { 0, }, samplers = { 0, };
-   VkResult result;
+   struct gen_mi_builder b;
+   gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+   if (conditional_render_enabled) {
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+      gen_mi_store(&b, gen_mi_reg64(TMP_DRAW_COUNT_REG),
+                   gen_mi_mem32(count_address));
+#endif
+   } else {
+      /* Upload the current draw count from the draw parameters buffer to
+       * MI_PREDICATE_SRC0.
+       */
+      gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0),
+                   gen_mi_mem32(count_address));
+
+      gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC1 + 4), gen_mi_imm(0));
+   }
+}
+
+static void
+emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
+                          uint32_t draw_index)
+{
+   struct gen_mi_builder b;
+   gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+   /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
+   gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC1), gen_mi_imm(draw_index));
+
+   if (draw_index == 0) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+         mip.LoadOperation    = LOAD_LOADINV;
+         mip.CombineOperation = COMBINE_SET;
+         mip.CompareOperation = COMPARE_SRCS_EQUAL;
+      }
+   } else {
+      /* While draw_index < draw_count the predicate's result will be
+       *  (draw_index == draw_count) ^ TRUE = TRUE
+       * When draw_index == draw_count the result is
+       *  (TRUE) ^ TRUE = FALSE
+       * After this all results will be:
+       *  (FALSE) ^ FALSE = FALSE
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+         mip.LoadOperation    = LOAD_LOAD;
+         mip.CombineOperation = COMBINE_XOR;
+         mip.CompareOperation = COMPARE_SRCS_EQUAL;
+      }
+   }
+}
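// A CPU model (not from the mesa tree) of the predicate sequence above.
// The first iteration does LOADINV/SET, later ones LOAD/XOR, which keeps
// the predicate TRUE while draw_index < draw_count and FALSE from
// draw_index == draw_count onward, exactly as the comment works out:

#include <stdbool.h>
#include <stdint.h>
#include <assert.h>

int main(void)
{
   uint32_t draw_count = 3; /* value loaded into MI_PREDICATE_SRC0 */
   bool result = false;     /* models MI_PREDICATE_RESULT */

   for (uint32_t i = 0; i < 5; i++) {
      bool srcs_equal = (i == draw_count);  /* COMPARE_SRCS_EQUAL */
      if (i == 0)
         result = !srcs_equal;              /* LOAD_LOADINV + COMBINE_SET */
      else
         result = result ^ srcs_equal;      /* LOAD_LOAD + COMBINE_XOR */
      assert(result == (i < draw_count));
   }
   return 0;
}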
+ */ + + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), pred); + gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +#endif +} +#endif + +void genX(CmdDrawIndirectCountKHR)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + genX(cmd_buffer_flush_state)(cmd_buffer); + + struct anv_address count_address = + anv_address_add(count_buffer->address, countBufferOffset); + + prepare_for_draw_count_predicate(cmd_buffer, count_address, + cmd_state->conditional_render_enabled); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + +#if GEN_GEN >= 8 || GEN_IS_HASWELL + if (cmd_state->conditional_render_enabled) { + emit_draw_count_predicate_with_conditional_render(cmd_buffer, i); + } else { + emit_draw_count_predicate(cmd_buffer, i); + } +#else + emit_draw_count_predicate(cmd_buffer, i); +#endif + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + load_indirect_parameters(cmd_buffer, draw, false); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = true; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = pipeline->topology; + } + + offset += stride; + } +} + +void genX(CmdDrawIndexedIndirectCountKHR)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + genX(cmd_buffer_flush_state)(cmd_buffer); + + struct anv_address count_address = + anv_address_add(count_buffer->address, countBufferOffset); + + prepare_for_draw_count_predicate(cmd_buffer, count_address, + cmd_state->conditional_render_enabled); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + +#if GEN_GEN >= 8 || GEN_IS_HASWELL + if (cmd_state->conditional_render_enabled) { + emit_draw_count_predicate_with_conditional_render(cmd_buffer, i); + } else { + emit_draw_count_predicate(cmd_buffer, i); + } +#else + emit_draw_count_predicate(cmd_buffer, i); +#endif + + /* TODO: We need to stomp base vertex to 0 somehow */ + if (vs_prog_data->uses_firstvertex || + 
 
+void genX(CmdDrawIndexedIndirectCountKHR)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    VkBuffer                                    _countBuffer,
+    VkDeviceSize                                countBufferOffset,
+    uint32_t                                    maxDrawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;
+   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   genX(cmd_buffer_flush_state)(cmd_buffer);
+
+   struct anv_address count_address =
+      anv_address_add(count_buffer->address, countBufferOffset);
+
+   prepare_for_draw_count_predicate(cmd_buffer, count_address,
+                                    cmd_state->conditional_render_enabled);
+
+   for (uint32_t i = 0; i < maxDrawCount; i++) {
+      struct anv_address draw = anv_address_add(buffer->address, offset);
+
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+      if (cmd_state->conditional_render_enabled) {
+         emit_draw_count_predicate_with_conditional_render(cmd_buffer, i);
+      } else {
+         emit_draw_count_predicate(cmd_buffer, i);
+      }
+#else
+      emit_draw_count_predicate(cmd_buffer, i);
+#endif
+
+      /* TODO: We need to stomp base vertex to 0 somehow */
+      if (vs_prog_data->uses_firstvertex ||
+          vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
+      if (vs_prog_data->uses_drawid)
+         emit_draw_index(cmd_buffer, i);
+
+      load_indirect_parameters(cmd_buffer, draw, true);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+         prim.IndirectParameterEnable = true;
+         prim.PredicateEnable = true;
+         prim.VertexAccessType = RANDOM;
+         prim.PrimitiveTopologyType = pipeline->topology;
+      }
+
+      offset += stride;
+   }
+}
+
+void genX(CmdBeginTransformFeedbackEXT)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstCounterBuffer,
+    uint32_t                                    counterBufferCount,
+    const VkBuffer*                             pCounterBuffers,
+    const VkDeviceSize*                         pCounterBufferOffsets)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+   assert(counterBufferCount <= MAX_XFB_BUFFERS);
+   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+    *
+    *    "Software must ensure that no HW stream output operations can be in
+    *    process or otherwise pending at the point that the MI_LOAD/STORE
+    *    commands are processed. This will likely require a pipeline flush."
+    */
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+      /* If we have a counter buffer, this is a resume so we need to load the
+       * value into the streamout offset register.  Otherwise, this is a begin
+       * and we need to reset it to zero.
+       */
+      if (pCounterBuffers &&
+          idx >= firstCounterBuffer &&
+          idx - firstCounterBuffer < counterBufferCount &&
+          pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
+         uint32_t cb_idx = idx - firstCounterBuffer;
+         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+         uint64_t offset = pCounterBufferOffsets ?
+                           pCounterBufferOffsets[cb_idx] : 0;
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+            lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+            lrm.MemoryAddress = anv_address_add(counter_buffer->address,
+                                                offset);
+         }
+      } else {
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+            lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+            lri.DataDWord = 0;
+         }
+      }
+   }
+
+   cmd_buffer->state.xfb_enabled = true;
+   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
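// A CPU model (not from the mesa tree) of the counter-buffer protocol in
// Begin/End above: on a begin, SO_WRITE_OFFSETn is zeroed unless a counter
// buffer supplies a saved offset (a resume); on an end, the live register
// is saved back out so a later resume can continue the stream:

#include <stdint.h>
#include <assert.h>

static uint32_t so_write_offset; /* models the SO_WRITE_OFFSETn register */

static void begin_xfb(const uint32_t *counter /* NULL on a fresh begin */)
{
   so_write_offset = counter ? *counter : 0;
}

static void end_xfb(uint32_t *counter /* NULL if the offset is discarded */)
{
   if (counter)
      *counter = so_write_offset;
}

int main(void)
{
   uint32_t saved;
   begin_xfb(NULL);        /* fresh begin: offset starts at 0 */
   so_write_offset += 256; /* hardware advances it as vertices are written */
   end_xfb(&saved);        /* pause: save to the counter buffer */
   begin_xfb(&saved);      /* resume: reload the saved offset */
   assert(so_write_offset == 256);
   return 0;
}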
 
+void genX(CmdEndTransformFeedbackEXT)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstCounterBuffer,
+    uint32_t                                    counterBufferCount,
+    const VkBuffer*                             pCounterBuffers,
+    const VkDeviceSize*                         pCounterBufferOffsets)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+   assert(counterBufferCount <= MAX_XFB_BUFFERS);
+   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+    *
+    *    "Software must ensure that no HW stream output operations can be in
+    *    process or otherwise pending at the point that the MI_LOAD/STORE
+    *    commands are processed. This will likely require a pipeline flush."
+    */
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
+      unsigned idx = firstCounterBuffer + cb_idx;
+
+      /* If we have a counter buffer, this is a pause so we need to store the
+       * current value of the streamout offset register into the counter
+       * buffer so that a later resume can pick up where this stream left off.
+       */
+      if (pCounterBuffers &&
+          cb_idx < counterBufferCount &&
+          pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
+         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+         uint64_t offset = pCounterBufferOffsets ?
+                           pCounterBufferOffsets[cb_idx] : 0;
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+            srm.MemoryAddress = anv_address_add(counter_buffer->address,
+                                                offset);
+            srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+         }
+      }
+   }
+
+   cmd_buffer->state.xfb_enabled = false;
+   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+static VkResult
+flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline;
+   struct anv_state surfaces = { 0, }, samplers = { 0, };
+   VkResult result;
 
    result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
    if (result != VK_SUCCESS) {
@@ -2967,7 +3523,7 @@ void
 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline;
-   MAYBE_UNUSED VkResult result;
+   VkResult result;
 
    assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
 
@@ -3010,6 +3566,8 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
             curbe.CURBEDataStartAddress = push_state.offset;
          }
       }
+
+      cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
    }
 
    cmd_buffer->state.compute.pipeline_dirty = false;
@@ -3045,16 +3603,8 @@ anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
 
-   VkResult result =
-      anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, MESA_SHADER_COMPUTE,
-                                                base_work_group_id);
-   if (result != VK_SUCCESS) {
-      cmd_buffer->batch.status = result;
-      return;
-   }
-
    struct anv_push_constants *push =
-      cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE];
+      &cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE];
    if (push->base_work_group_id[0] != baseGroupX ||
        push->base_work_group_id[1] != baseGroupY ||
        push->base_work_group_id[2] != baseGroupZ) {
@@ -3101,16 +3651,19 @@ void genX(CmdDispatchBase)(
       sizes[0] = groupCountX;
       sizes[1] = groupCountY;
       sizes[2] = groupCountZ;
-      anv_state_flush(cmd_buffer->device, state);
       cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
-         .bo = &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+         .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         .offset = state.offset,
      };
   }
 
   genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
      ggw.SIMDSize = prog_data->simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
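// For reference (illustrative struct name, not from the mesa tree): the
// 12-byte num_workgroups record CmdDispatchBase uploads above for shaders
// that read their own dispatch size (e.g. gl_NumWorkGroups).  The indirect
// dispatch path below reads back the same three dwords:

#include <stdint.h>

struct num_workgroups {
   uint32_t x;   /* dword 0: groupCountX */
   uint32_t y;   /* dword 1: groupCountY */
   uint32_t z;   /* dword 2: groupCountZ */
};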
@@ -3138,8 +3691,7 @@ void genX(CmdDispatchIndirect)(
    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline;
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
+   struct anv_address addr = anv_address_add(buffer->address, offset);
    struct anv_batch *batch = &cmd_buffer->batch;
 
    anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
@@ -3153,49 +3705,42 @@ void genX(CmdDispatchIndirect)(
       return;
 #endif
 
-   if (prog_data->uses_num_work_groups) {
-      cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
-         .bo = bo,
-         .offset = bo_offset,
-      };
-   }
+   if (prog_data->uses_num_work_groups)
+      cmd_buffer->state.compute.num_workgroups = addr;
 
    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 
-   emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
-   emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
-   emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
+   struct gen_mi_builder b;
+   gen_mi_builder_init(&b, &cmd_buffer->batch);
 
-#if GEN_GEN <= 7
-   /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
-   emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
-   emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
-   emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
+   struct gen_mi_value size_x = gen_mi_mem32(anv_address_add(addr, 0));
+   struct gen_mi_value size_y = gen_mi_mem32(anv_address_add(addr, 4));
+   struct gen_mi_value size_z = gen_mi_mem32(anv_address_add(addr, 8));
 
-   /* Load compute_dispatch_indirect_x_size into SRC0 */
-   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
+   gen_mi_store(&b, gen_mi_reg32(GPGPU_DISPATCHDIMX), size_x);
+   gen_mi_store(&b, gen_mi_reg32(GPGPU_DISPATCHDIMY), size_y);
+   gen_mi_store(&b, gen_mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
 
+#if GEN_GEN <= 7
    /* predicate = (compute_dispatch_indirect_x_size == 0); */
+   gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0), size_x);
+   gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0));
    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
       mip.LoadOperation    = LOAD_LOAD;
       mip.CombineOperation = COMBINE_SET;
       mip.CompareOperation = COMPARE_SRCS_EQUAL;
    }
 
-   /* Load compute_dispatch_indirect_y_size into SRC0 */
-   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
-
    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
+   gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC0), size_y);
    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
       mip.LoadOperation    = LOAD_LOAD;
       mip.CombineOperation = COMBINE_OR;
       mip.CompareOperation = COMPARE_SRCS_EQUAL;
    }
 
-   /* Load compute_dispatch_indirect_z_size into SRC0 */
-   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
-
    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
+   gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC0), size_z);
    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
       mip.LoadOperation    = LOAD_LOAD;
       mip.CombineOperation = COMBINE_OR;
@@ -3203,17 +3748,34 @@ void genX(CmdDispatchIndirect)(
    }
 
    /* predicate = !predicate; */
-#define COMPARE_FALSE 1
    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
       mip.LoadOperation    = LOAD_LOADINV;
       mip.CombineOperation = COMBINE_OR;
       mip.CompareOperation = COMPARE_FALSE;
    }
+
+#if GEN_IS_HASWELL
+   if (cmd_buffer->state.conditional_render_enabled) {
+      /* predicate &= !(conditional_rendering_predicate == 0); */
+      gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_SRC0),
+                   gen_mi_reg32(ANV_PREDICATE_RESULT_REG));
+      anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
+         mip.LoadOperation    = LOAD_LOADINV;
+         mip.CombineOperation = COMBINE_AND;
+         mip.CompareOperation = COMPARE_SRCS_EQUAL;
+      }
+   }
+#endif
+
+#else /* GEN_GEN > 7 */
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 #endif
 
    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable = true;
-      ggw.PredicateEnable = GEN_GEN <= 7;
+      ggw.PredicateEnable = GEN_GEN <= 7 ||
                            cmd_buffer->state.conditional_render_enabled;
      ggw.SIMDSize = prog_data->simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
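// A CPU model (not from the mesa tree) of the gen7 predicate chain above,
// which computes !(x == 0 || y == 0 || z == 0) as the final predicate value
// the GPGPU_WALKER is gated on, so a zero-sized indirect dispatch is
// skipped:

#include <stdbool.h>
#include <stdint.h>
#include <assert.h>

static bool
final_predicate(uint32_t x, uint32_t y, uint32_t z)
{
   bool p = (x == 0);   /* LOAD + SET */
   p |= (y == 0);       /* LOAD + OR */
   p |= (z == 0);       /* LOAD + OR */
   return !p;           /* the final LOADINV step */
}

int main(void)
{
   assert(final_predicate(4, 2, 1));
   assert(!final_predicate(4, 0, 1));
   return 0;
}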
@@ -3248,6 +3810,25 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
 #endif
 
+#if GEN_GEN == 9
+   if (pipeline == _3D) {
+      /* There is a mid-object preemption workaround which requires you to
+       * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D.  However,
+       * even without preemption, we have issues with geometry flickering when
+       * GPGPU and 3D are back-to-back and this seems to fix it.  We don't
+       * really know why.
+       */
+      const uint32_t subslices =
+         MAX2(cmd_buffer->device->instance->physicalDevice.subslice_total, 1);
+      anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
+         vfe.MaximumNumberofThreads =
+            devinfo->max_cs_threads * subslices - 1;
+         vfe.NumberofURBEntries = 2;
+         vfe.URBEntryAllocationSize = 2;
+      }
+   }
+#endif
+
    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
@@ -3361,9 +3942,7 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
    if (dw == NULL)
       return;
 
-   struct isl_depth_stencil_hiz_emit_info info = {
-      .mocs = device->default_mocs,
-   };
+   struct isl_depth_stencil_hiz_emit_info info = { };
 
    if (iview)
       info.view = &iview->planes[0].isl;
@@ -3378,12 +3957,14 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
      info.depth_address =
         anv_batch_emit_reloc(&cmd_buffer->batch,
                              dw + device->isl_dev.ds.depth_offset / 4,
-                             image->planes[depth_plane].bo,
-                             image->planes[depth_plane].bo_offset +
+                             image->planes[depth_plane].address.bo,
+                             image->planes[depth_plane].address.offset +
                              surface->offset);
+      info.mocs =
+         anv_mocs_for_bo(device, image->planes[depth_plane].address.bo);
 
      const uint32_t ds =
-         cmd_buffer->state.subpass->depth_stencil_attachment.attachment;
+         cmd_buffer->state.subpass->depth_stencil_attachment->attachment;
      info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;
      if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
        info.hiz_surf = &image->planes[depth_plane].aux_surface.isl;
@@ -3391,8 +3972,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
        info.hiz_address =
           anv_batch_emit_reloc(&cmd_buffer->batch,
                                dw + device->isl_dev.ds.hiz_offset / 4,
-                               image->planes[depth_plane].bo,
-                               image->planes[depth_plane].bo_offset +
+                               image->planes[depth_plane].address.bo,
+                               image->planes[depth_plane].address.offset +
                                image->planes[depth_plane].aux_surface.offset);
 
        info.depth_clear_value = ANV_HZ_FC_VAL;
@@ -3409,8 +3990,11 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
      info.stencil_address =
         anv_batch_emit_reloc(&cmd_buffer->batch,
                              dw + device->isl_dev.ds.stencil_offset / 4,
-                             image->planes[stencil_plane].bo,
-                             image->planes[stencil_plane].bo_offset + surface->offset);
+                             image->planes[stencil_plane].address.bo,
+                             image->planes[stencil_plane].address.offset +
+                             surface->offset);
+      info.mocs =
+         anv_mocs_for_bo(device, image->planes[stencil_plane].address.bo);
    }
 
    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
@@ -3502,7 +4086,7 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
      assert(a < cmd_state->pass->attachment_count);
      struct anv_attachment_state *att_state = &cmd_state->attachments[a];
-      struct anv_image_view *iview = fb->attachments[a];
+      struct anv_image_view *iview = cmd_state->attachments[a].image_view;
      const struct anv_image *image = iview->image;
 
      /* A resolve is necessary before use as an input attachment if the clear
@@ -3525,30 +4109,38 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
         target_layout = subpass->attachments[i].layout;
      }
 
+      uint32_t base_layer, layer_count;
+      if (image->type == VK_IMAGE_TYPE_3D) {
+         base_layer = 0;
+         layer_count = anv_minify(iview->image->extent.depth,
+                                  iview->planes[0].isl.base_level);
+      } else {
+         base_layer = iview->planes[0].isl.base_array_layer;
+         layer_count = fb->layers;
+      }
+
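// A worked example (not from the mesa tree) of the 3D-image layer count
// above: anv_minify is essentially max(1, extent >> level), so the number
// of depth slices shrinks with the view's miplevel:

#include <stdint.h>
#include <assert.h>

static uint32_t minify(uint32_t extent, uint32_t level)
{
   uint32_t v = extent >> level;
   return v ? v : 1;
}

int main(void)
{
   assert(minify(32, 0) == 32); /* level 0: all 32 depth slices */
   assert(minify(32, 2) == 8);  /* level 2: 8 slices */
   assert(minify(32, 6) == 1);  /* never goes below 1 */
   return 0;
}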
      if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
        assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
-
-         uint32_t base_layer, layer_count;
-         if (image->type == VK_IMAGE_TYPE_3D) {
-            base_layer = 0;
-            layer_count = anv_minify(iview->image->extent.depth,
-                                     iview->planes[0].isl.base_level);
-         } else {
-            base_layer = iview->planes[0].isl.base_array_layer;
-            layer_count = fb->layers;
-         }
-
        transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
                                iview->planes[0].isl.base_level, 1,
                                base_layer, layer_count,
                                att_state->current_layout, target_layout);
-      } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
+      }
+
+      if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
        transition_depth_buffer(cmd_buffer, image,
                                att_state->current_layout, target_layout);
        att_state->aux_usage =
           anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
                                   VK_IMAGE_ASPECT_DEPTH_BIT, target_layout);
      }
+
+      if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
+         transition_stencil_buffer(cmd_buffer, image,
+                                   iview->planes[0].isl.base_level, 1,
+                                   base_layer, layer_count,
+                                   att_state->current_layout, target_layout);
+      }
+
      att_state->current_layout = target_layout;
 
      if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
@@ -3567,22 +4159,28 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
            assert(iview->planes[0].isl.base_level == 0);
            assert(iview->planes[0].isl.base_array_layer == 0);
 
+            union isl_color_value clear_color = {};
+            anv_clear_color_from_att_state(&clear_color, att_state, iview);
            if (iview->image->samples == 1) {
-               anv_image_ccs_op(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
-                                0, 0, 1, ISL_AUX_OP_FAST_CLEAR, false);
+               anv_image_ccs_op(cmd_buffer, image,
+                                iview->planes[0].isl.format,
+                                VK_IMAGE_ASPECT_COLOR_BIT,
+                                0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
+                                &clear_color,
+                                false);
            } else {
-               anv_image_mcs_op(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
-                                0, 1, ISL_AUX_OP_FAST_CLEAR, false);
+               anv_image_mcs_op(cmd_buffer, image,
+                                iview->planes[0].isl.format,
+                                VK_IMAGE_ASPECT_COLOR_BIT,
+                                0, 1, ISL_AUX_OP_FAST_CLEAR,
+                                &clear_color,
+                                false);
            }
            base_clear_layer++;
            clear_layer_count--;
            if (is_multiview)
              att_state->pending_clear_views &= ~1;
 
-            genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color.state,
-                                         image, VK_IMAGE_ASPECT_COLOR_BIT,
-                                         true /* copy from ss */);
-
            if (att_state->clear_color_is_zero) {
              /* This image has the auxiliary buffer enabled.  We can mark the
               * subresource as not needing a resolve because the clear color
@@ -3693,8 +4291,9 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
        assert(att_state->pending_clear_aspects == 0);
      }
 
-      if ((att_state->pending_load_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) &&
-          image->planes[0].aux_surface.isl.size > 0 &&
+      if (GEN_GEN < 10 &&
+          (att_state->pending_load_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) &&
+          image->planes[0].aux_surface.isl.size_B > 0 &&
          iview->planes[0].isl.base_level == 0 &&
         iview->planes[0].isl.base_array_layer == 0) {
        if (att_state->aux_usage != ISL_AUX_USAGE_NONE) {
@@ -3769,16 +4368,250 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
    cmd_buffer_emit_depth_stencil(cmd_buffer);
 }
 
+static enum blorp_filter
+vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode)
+{
+   switch (vk_mode) {
+   case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR:
+      return BLORP_FILTER_SAMPLE_0;
+   case VK_RESOLVE_MODE_AVERAGE_BIT_KHR:
+      return BLORP_FILTER_AVERAGE;
+   case VK_RESOLVE_MODE_MIN_BIT_KHR:
+      return BLORP_FILTER_MIN_SAMPLE;
+   case VK_RESOLVE_MODE_MAX_BIT_KHR:
+      return BLORP_FILTER_MAX_SAMPLE;
+   default:
+      return BLORP_FILTER_NONE;
+   }
+}
+
 static void
 cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
    struct anv_subpass *subpass = cmd_state->subpass;
    uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state);
+   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
 
-   anv_cmd_buffer_resolve_subpass(cmd_buffer);
+   if (subpass->has_color_resolve) {
+      /* We are about to do some MSAA resolves.  We need to flush so that the
+       * result of writes to the MSAA color attachments show up in the sampler
+       * when we blit to the single-sampled resolve target.
+       */
+      cmd_buffer->state.pending_pipe_bits |=
+         ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+
+      for (uint32_t i = 0; i < subpass->color_count; ++i) {
+         uint32_t src_att = subpass->color_attachments[i].attachment;
+         uint32_t dst_att = subpass->resolve_attachments[i].attachment;
+
+         if (dst_att == VK_ATTACHMENT_UNUSED)
+            continue;
+
+         assert(src_att < cmd_buffer->state.pass->attachment_count);
+         assert(dst_att < cmd_buffer->state.pass->attachment_count);
+
+         if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
+            /* From the Vulkan 1.0 spec:
+             *
+             *    If the first use of an attachment in a render pass is as a
+             *    resolve attachment, then the loadOp is effectively ignored
+             *    as the resolve is guaranteed to overwrite all pixels in the
+             *    render area.
+             */
+            cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
+         }
+
+         struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
+         struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;
+
+         const VkRect2D render_area = cmd_buffer->state.render_area;
+
+         enum isl_aux_usage src_aux_usage =
+            cmd_buffer->state.attachments[src_att].aux_usage;
+         enum isl_aux_usage dst_aux_usage =
+            cmd_buffer->state.attachments[dst_att].aux_usage;
+
+         assert(src_iview->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT &&
+                dst_iview->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT);
+
+         anv_image_msaa_resolve(cmd_buffer,
+                                src_iview->image, src_aux_usage,
+                                src_iview->planes[0].isl.base_level,
+                                src_iview->planes[0].isl.base_array_layer,
+                                dst_iview->image, dst_aux_usage,
+                                dst_iview->planes[0].isl.base_level,
+                                dst_iview->planes[0].isl.base_array_layer,
+                                VK_IMAGE_ASPECT_COLOR_BIT,
+                                render_area.offset.x, render_area.offset.y,
+                                render_area.offset.x, render_area.offset.y,
+                                render_area.extent.width,
+                                render_area.extent.height,
+                                fb->layers, BLORP_FILTER_NONE);
+      }
+   }
+
+   if (subpass->ds_resolve_attachment) {
+      /* We are about to do some MSAA resolves.  We need to flush so that the
+       * result of writes to the MSAA depth attachments show up in the sampler
+       * when we blit to the single-sampled resolve target.
+       */
+      cmd_buffer->state.pending_pipe_bits |=
+         ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+         ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+
+      uint32_t src_att = subpass->depth_stencil_attachment->attachment;
+      uint32_t dst_att = subpass->ds_resolve_attachment->attachment;
+
+      assert(src_att < cmd_buffer->state.pass->attachment_count);
+      assert(dst_att < cmd_buffer->state.pass->attachment_count);
+
+      if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
+         /* From the Vulkan 1.0 spec:
+          *
+          *    If the first use of an attachment in a render pass is as a
+          *    resolve attachment, then the loadOp is effectively ignored
+          *    as the resolve is guaranteed to overwrite all pixels in the
+          *    render area.
+          */
+         cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
+      }
+
+      struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
+      struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;
+
+      const VkRect2D render_area = cmd_buffer->state.render_area;
+
+      if ((src_iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
+          subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
+
+         struct anv_attachment_state *src_state =
+            &cmd_state->attachments[src_att];
+         struct anv_attachment_state *dst_state =
+            &cmd_state->attachments[dst_att];
+
+         /* MSAA resolves sample from the source attachment.  Transition the
+          * depth attachment first to get rid of any HiZ that we may not be
+          * able to handle.
+          */
+         transition_depth_buffer(cmd_buffer, src_iview->image,
+                                 src_state->current_layout,
+                                 VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+         src_state->aux_usage =
+            anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image,
+                                    VK_IMAGE_ASPECT_DEPTH_BIT,
+                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+         src_state->current_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
+         /* MSAA resolves write to the resolve attachment as if it were any
+          * other transfer op.  Transition the resolve attachment accordingly.
+          */
+         VkImageLayout dst_initial_layout = dst_state->current_layout;
+
+         /* If our render area is the entire size of the image, we're going to
+          * blow it all away so we can claim the initial layout is UNDEFINED
+          * and we'll get a HiZ ambiguate instead of a resolve.
+          */
+         if (dst_iview->image->type != VK_IMAGE_TYPE_3D &&
+             render_area.offset.x == 0 && render_area.offset.y == 0 &&
+             render_area.extent.width == dst_iview->extent.width &&
+             render_area.extent.height == dst_iview->extent.height)
+            dst_initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+
+         transition_depth_buffer(cmd_buffer, dst_iview->image,
+                                 dst_initial_layout,
+                                 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+         dst_state->aux_usage =
+            anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image,
+                                    VK_IMAGE_ASPECT_DEPTH_BIT,
+                                    VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+         dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+
+         enum blorp_filter filter =
+            vk_to_blorp_resolve_mode(subpass->depth_resolve_mode);
+
+         anv_image_msaa_resolve(cmd_buffer,
+                                src_iview->image, src_state->aux_usage,
+                                src_iview->planes[0].isl.base_level,
+                                src_iview->planes[0].isl.base_array_layer,
+                                dst_iview->image, dst_state->aux_usage,
+                                dst_iview->planes[0].isl.base_level,
+                                dst_iview->planes[0].isl.base_array_layer,
+                                VK_IMAGE_ASPECT_DEPTH_BIT,
+                                render_area.offset.x, render_area.offset.y,
+                                render_area.offset.x, render_area.offset.y,
+                                render_area.extent.width,
+                                render_area.extent.height,
+                                fb->layers, filter);
+      }
+
+      if ((src_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
+          subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
+
+         enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE;
+         enum isl_aux_usage dst_aux_usage = ISL_AUX_USAGE_NONE;
+
+         enum blorp_filter filter =
+            vk_to_blorp_resolve_mode(subpass->stencil_resolve_mode);
+
+         anv_image_msaa_resolve(cmd_buffer,
+                                src_iview->image, src_aux_usage,
+                                src_iview->planes[0].isl.base_level,
+                                src_iview->planes[0].isl.base_array_layer,
+                                dst_iview->image, dst_aux_usage,
+                                dst_iview->planes[0].isl.base_level,
+                                dst_iview->planes[0].isl.base_array_layer,
+                                VK_IMAGE_ASPECT_STENCIL_BIT,
+                                render_area.offset.x, render_area.offset.y,
+                                render_area.offset.x, render_area.offset.y,
+                                render_area.extent.width,
+                                render_area.extent.height,
+                                fb->layers, filter);
+      }
+   }
+
+#if GEN_GEN == 7
+   /* On gen7, we have to store a texturable version of the stencil buffer in
+    * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
+    * forth at strategic points.  Stencil writes are only allowed in four
+    * layouts:
+    *
+    *  - VK_IMAGE_LAYOUT_GENERAL
+    *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
+    *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
+    *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
+    *
+    * For general, we have no nice opportunity to transition so we do the copy
+    * to the shadow unconditionally at the end of the subpass.  For transfer
+    * destinations, we can update it as part of the transfer op.  For the
+    * other two, we delay the copy until a transition into some other layout.
+    */
+   if (subpass->depth_stencil_attachment) {
+      uint32_t a = subpass->depth_stencil_attachment->attachment;
+      assert(a != VK_ATTACHMENT_UNUSED);
+
+      struct anv_attachment_state *att_state = &cmd_state->attachments[a];
+      struct anv_image_view *iview = cmd_state->attachments[a].image_view;
+      const struct anv_image *image = iview->image;
+
+      if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
+         uint32_t plane = anv_image_aspect_to_plane(image->aspects,
+                                                    VK_IMAGE_ASPECT_STENCIL_BIT);
+
+         if (image->planes[plane].shadow_surface.isl.size_B > 0 &&
+             att_state->current_layout == VK_IMAGE_LAYOUT_GENERAL) {
+            assert(image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
+            anv_image_copy_to_shadow(cmd_buffer, image,
+                                     VK_IMAGE_ASPECT_STENCIL_BIT,
+                                     iview->planes[plane].isl.base_level, 1,
+                                     iview->planes[plane].isl.base_array_layer,
+                                     fb->layers);
+         }
+      }
+   }
+#endif /* GEN_GEN == 7 */
 
-   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
@@ -3789,34 +4622,91 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
      assert(a < cmd_state->pass->attachment_count);
      struct anv_attachment_state *att_state = &cmd_state->attachments[a];
-      struct anv_image_view *iview = fb->attachments[a];
+      struct anv_image_view *iview = cmd_state->attachments[a].image_view;
      const struct anv_image *image = iview->image;
 
+      if ((image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) &&
+          image->vk_format != iview->vk_format) {
+         enum anv_fast_clear_type fast_clear_type =
+            anv_layout_to_fast_clear_type(&cmd_buffer->device->info,
+                                          image, VK_IMAGE_ASPECT_COLOR_BIT,
+                                          att_state->current_layout);
+
+         /* If any clear color was used, flush it down the aux surfaces.  If we
+          * don't do it now using the view's format we might use the clear
+          * color incorrectly in the following resolves (for example with an
+          * SRGB view & a UNORM image).
+          */
+         if (fast_clear_type != ANV_FAST_CLEAR_NONE) {
+            anv_perf_warn(cmd_buffer->device->instance, iview,
+                          "Doing a partial resolve to get rid of clear color at the "
+                          "end of a renderpass due to an image/view format mismatch");
+
+            uint32_t base_layer, layer_count;
+            if (image->type == VK_IMAGE_TYPE_3D) {
+               base_layer = 0;
+               layer_count = anv_minify(iview->image->extent.depth,
+                                        iview->planes[0].isl.base_level);
+            } else {
+               base_layer = iview->planes[0].isl.base_array_layer;
+               layer_count = fb->layers;
+            }
+
+            for (uint32_t a = 0; a < layer_count; a++) {
+               uint32_t array_layer = base_layer + a;
+               if (image->samples == 1) {
+                  anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
+                                                 iview->planes[0].isl.format,
+                                                 VK_IMAGE_ASPECT_COLOR_BIT,
+                                                 iview->planes[0].isl.base_level,
+                                                 array_layer,
+                                                 ISL_AUX_OP_PARTIAL_RESOLVE,
+                                                 ANV_FAST_CLEAR_NONE);
+               } else {
+                  anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
+                                                 iview->planes[0].isl.format,
+                                                 VK_IMAGE_ASPECT_COLOR_BIT,
+                                                 base_layer,
+                                                 ISL_AUX_OP_PARTIAL_RESOLVE,
+                                                 ANV_FAST_CLEAR_NONE);
+               }
+            }
+         }
+      }
+
      /* Transition the image into the final layout for this render pass */
      VkImageLayout target_layout =
        cmd_state->pass->attachments[a].final_layout;
 
+      uint32_t base_layer, layer_count;
+      if (image->type == VK_IMAGE_TYPE_3D) {
+         base_layer = 0;
+         layer_count = anv_minify(iview->image->extent.depth,
+                                  iview->planes[0].isl.base_level);
+      } else {
+         base_layer = iview->planes[0].isl.base_array_layer;
+         layer_count = fb->layers;
+      }
+
      if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
        assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
-
-         uint32_t base_layer, layer_count;
-         if (image->type == VK_IMAGE_TYPE_3D) {
-            base_layer = 0;
-            layer_count = anv_minify(iview->image->extent.depth,
-                                     iview->planes[0].isl.base_level);
-         } else {
-            base_layer = iview->planes[0].isl.base_array_layer;
-            layer_count = fb->layers;
-         }
-
        transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
                                iview->planes[0].isl.base_level, 1,
                                base_layer, layer_count,
                                att_state->current_layout, target_layout);
-      } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
+      }
+
+      if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
        transition_depth_buffer(cmd_buffer, image,
                                att_state->current_layout, target_layout);
      }
+
+      if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
+         transition_stencil_buffer(cmd_buffer, image,
+                                   iview->planes[0].isl.base_level, 1,
+                                   base_layer, layer_count,
+                                   att_state->current_layout, target_layout);
+      }
    }
 
    /* Accumulate any subpass flushes that need to happen after the subpass.
@@ -3854,6 +4744,15 @@ void genX(CmdBeginRenderPass)(
    cmd_buffer_begin_subpass(cmd_buffer, 0);
 }
 
+void genX(CmdBeginRenderPass2KHR)(
+    VkCommandBuffer                             commandBuffer,
+    const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
+    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
+{
+   genX(CmdBeginRenderPass)(commandBuffer, pRenderPassBeginInfo,
+                            pSubpassBeginInfo->contents);
+}
+
 void genX(CmdNextSubpass)(
     VkCommandBuffer                             commandBuffer,
     VkSubpassContents                           contents)
@@ -3870,6 +4769,14 @@ void genX(CmdNextSubpass)(
    cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
 }
 
+void genX(CmdNextSubpass2KHR)(
+    VkCommandBuffer                             commandBuffer,
+    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo,
+    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
+{
+   genX(CmdNextSubpass)(commandBuffer, pSubpassBeginInfo->contents);
+}
+
 void genX(CmdEndRenderPass)(
     VkCommandBuffer                             commandBuffer)
 {
@@ -3883,7 +4790,7 @@ void genX(CmdEndRenderPass)(
    cmd_buffer->state.hiz_enabled = false;
 
 #ifndef NDEBUG
-   anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
+   anv_dump_add_attachments(cmd_buffer);
 #endif
 
    /* Remove references to render pass specific state. This enables us to
@@ -3893,3 +4800,189 @@ void genX(CmdEndRenderPass)(
    cmd_buffer->state.pass = NULL;
    cmd_buffer->state.subpass = NULL;
 }
+
+void genX(CmdEndRenderPass2KHR)(
+    VkCommandBuffer                             commandBuffer,
+    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
+{
+   genX(CmdEndRenderPass)(commandBuffer);
+}
+
+void
+genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
+{
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+   struct gen_mi_builder b;
+   gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+   gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC0),
+                gen_mi_reg32(ANV_PREDICATE_RESULT_REG));
+   gen_mi_store(&b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(0));
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOADINV;
+      mip.CombineOperation = COMBINE_SET;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
+#endif
+}
+
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+void genX(CmdBeginConditionalRenderingEXT)(
+    VkCommandBuffer                             commandBuffer,
+    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
+   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+   struct anv_address value_address =
+      anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
+
+   const bool isInverted = pConditionalRenderingBegin->flags &
+                           VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
+
+   cmd_state->conditional_render_enabled = true;
+
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   struct gen_mi_builder b;
+   gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+   /* Section 19.4 of the Vulkan 1.1.85 spec says:
+    *
+    *    If the value of the predicate in buffer memory changes
+    *    while conditional rendering is active, the rendering commands
+    *    may be discarded in an implementation-dependent way.
+    *    Some implementations may latch the value of the predicate
+    *    upon beginning conditional rendering while others
+    *    may read it before every rendering command.
+    *
+    * So it's perfectly fine to read a value from the buffer once.
+    */
+   struct gen_mi_value value = gen_mi_mem32(value_address);
+
+   /* Precompute the predicate result; this is necessary to support secondary
+    * command buffers, since it is unknown whether conditional rendering is
+    * inverted at the time they are populated.
+    */
+   gen_mi_store(&b, gen_mi_reg64(ANV_PREDICATE_RESULT_REG),
+                isInverted ? gen_mi_uge(&b, gen_mi_imm(0), value) :
+                             gen_mi_ult(&b, gen_mi_imm(0), value));
+}
+
+void genX(CmdEndConditionalRenderingEXT)(
+    VkCommandBuffer                             commandBuffer)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+
+   cmd_state->conditional_render_enabled = false;
+}
+#endif
+
+/* Set of stage bits that are pipelined, i.e. they get queued by the
+ * command streamer for later execution.
+ */
+#define ANV_PIPELINE_STAGE_PIPELINED_BITS \
+   (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | \
+    VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | \
+    VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | \
+    VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | \
+    VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | \
+    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | \
+    VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | \
+    VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | \
+    VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | \
+    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | \
+    VK_PIPELINE_STAGE_TRANSFER_BIT | \
+    VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | \
+    VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | \
+    VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)
+
+void genX(CmdSetEvent)(
+    VkCommandBuffer                             commandBuffer,
+    VkEvent                                     _event,
+    VkPipelineStageFlags                        stageMask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_event, event, _event);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
+         pc.StallAtPixelScoreboard = true;
+         pc.CommandStreamerStallEnable = true;
+      }
+
+      pc.DestinationAddressType = DAT_PPGTT;
+      pc.PostSyncOperation = WriteImmediateData;
+      pc.Address = (struct anv_address) {
+         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+         event->state.offset
+      };
+      pc.ImmediateData = VK_EVENT_SET;
+   }
+}
+
+void genX(CmdResetEvent)(
+    VkCommandBuffer                             commandBuffer,
+    VkEvent                                     _event,
+    VkPipelineStageFlags                        stageMask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_event, event, _event);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
+         pc.StallAtPixelScoreboard = true;
+         pc.CommandStreamerStallEnable = true;
+      }
+
+      pc.DestinationAddressType = DAT_PPGTT;
+      pc.PostSyncOperation = WriteImmediateData;
+      pc.Address = (struct anv_address) {
+         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+         event->state.offset
+      };
+      pc.ImmediateData = VK_EVENT_RESET;
+   }
+}
anv_finishme("Implement events on gen7"); +#endif + + genX(CmdPipelineBarrier)(commandBuffer, srcStageMask, destStageMask, + false, /* byRegion */ + memoryBarrierCount, pMemoryBarriers, + bufferMemoryBarrierCount, pBufferMemoryBarriers, + imageMemoryBarrierCount, pImageMemoryBarriers); +}