X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2FgenX_cmd_buffer.c;h=ce546249b34bff7d0134228221c4b782e94c4590;hb=52056206e171f8eec0afc16cfd90ee68bf290e7b;hp=3691b4bdec9a3ed488fb000fd2cc33a1a7079ffb;hpb=20578f81a6a9a7d13b97083863d1240ac13aa5bd;p=mesa.git diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 3691b4bdec9..ce546249b34 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -202,24 +202,6 @@ add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer, } } -static bool -color_is_zero_one(VkClearColorValue value, enum isl_format format) -{ - if (isl_format_has_int_channel(format)) { - for (unsigned i = 0; i < 4; i++) { - if (value.int32[i] != 0 && value.int32[i] != 1) - return false; - } - } else { - for (unsigned i = 0; i < 4; i++) { - if (value.float32[i] != 0.0f && value.float32[i] != 1.0f) - return false; - } - } - - return true; -} - static void color_attachment_compute_aux_usage(struct anv_device * device, struct anv_cmd_state * cmd_state, @@ -241,16 +223,27 @@ color_attachment_compute_aux_usage(struct anv_device * device, att_state->input_aux_usage = ISL_AUX_USAGE_NONE; att_state->fast_clear = false; return; - } else if (iview->image->planes[0].aux_usage == ISL_AUX_USAGE_MCS) { - att_state->aux_usage = ISL_AUX_USAGE_MCS; + } + + att_state->aux_usage = + anv_layout_to_aux_usage(&device->info, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); + + /* If we don't have aux, then we should have returned early in the layer + * check above. If we got here, we must have something. 
+ */ + assert(att_state->aux_usage != ISL_AUX_USAGE_NONE); + + if (att_state->aux_usage == ISL_AUX_USAGE_MCS) { att_state->input_aux_usage = ISL_AUX_USAGE_MCS; att_state->fast_clear = false; return; - } else if (iview->image->planes[0].aux_usage == ISL_AUX_USAGE_CCS_E) { - att_state->aux_usage = ISL_AUX_USAGE_CCS_E; + } + + if (att_state->aux_usage == ISL_AUX_USAGE_CCS_E) { att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E; } else { - att_state->aux_usage = ISL_AUX_USAGE_CCS_D; /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode: * * "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D @@ -283,17 +276,46 @@ color_attachment_compute_aux_usage(struct anv_device * device, assert(iview->image->planes[0].aux_surface.isl.usage & ISL_SURF_USAGE_CCS_BIT); + const struct isl_format_layout *view_fmtl = + isl_format_get_layout(iview->planes[0].isl.format); + union isl_color_value clear_color = {}; + +#define COPY_CLEAR_COLOR_CHANNEL(c, i) \ + if (view_fmtl->channels.c.bits) \ + clear_color.u32[i] = att_state->clear_value.color.uint32[i] + + COPY_CLEAR_COLOR_CHANNEL(r, 0); + COPY_CLEAR_COLOR_CHANNEL(g, 1); + COPY_CLEAR_COLOR_CHANNEL(b, 2); + COPY_CLEAR_COLOR_CHANNEL(a, 3); + +#undef COPY_CLEAR_COLOR_CHANNEL + att_state->clear_color_is_zero_one = - color_is_zero_one(att_state->clear_value.color, iview->planes[0].isl.format); + isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format); att_state->clear_color_is_zero = - att_state->clear_value.color.uint32[0] == 0 && - att_state->clear_value.color.uint32[1] == 0 && - att_state->clear_value.color.uint32[2] == 0 && - att_state->clear_value.color.uint32[3] == 0; + isl_color_value_is_zero(clear_color, iview->planes[0].isl.format); if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { - /* Start off assuming fast clears are possible */ - att_state->fast_clear = true; + /* Start by getting the fast clear type. 
We use the first subpass + * layout here because we don't want to fast-clear if the first subpass + * to use the attachment can't handle fast-clears. + */ + enum anv_fast_clear_type fast_clear_type = + anv_layout_to_fast_clear_type(&device->info, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + cmd_state->pass->attachments[att].first_subpass_layout); + switch (fast_clear_type) { + case ANV_FAST_CLEAR_NONE: + att_state->fast_clear = false; + break; + case ANV_FAST_CLEAR_DEFAULT_VALUE: + att_state->fast_clear = att_state->clear_color_is_zero; + break; + case ANV_FAST_CLEAR_ANY: + att_state->fast_clear = true; + break; + } /* Potentially, we could do partial fast-clears but doing so has crazy * alignment restrictions. It's easier to just restrict to full size @@ -309,43 +331,101 @@ color_attachment_compute_aux_usage(struct anv_device * device, if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one) att_state->fast_clear = false; - /* We allow fast clears when all aux layers of the miplevel are targeted. - * See add_fast_clear_state_buffer() for more information. Also, because - * we only either do a fast clear or a normal clear and not both, this - * complies with the gen7 restriction of not fast-clearing multiple - * layers. + /* We only allow fast clears to the first slice of an image (level 0, + * layer 0) and only for the entire slice. This guarantees us that, at + * any given time, there is only one clear color on any given image at + * any given time. At the time of our testing (Jan 17, 2018), there + * were no known applications which would benefit from fast-clearing + * more than just the first slice. 
*/ - if (cmd_state->framebuffer->layers != - anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT, - iview->planes[0].isl.base_level)) { + if (att_state->fast_clear && + (iview->planes[0].isl.base_level > 0 || + iview->planes[0].isl.base_array_layer > 0)) { + anv_perf_warn(device->instance, iview->image, + "Rendering with multi-lod or multi-layer framebuffer " + "with LOAD_OP_LOAD and baseMipLevel > 0 or " + "baseArrayLayer > 0. Not fast clearing."); att_state->fast_clear = false; - if (GEN_GEN == 7) { - anv_perf_warn(device->instance, iview->image, - "Not fast-clearing the first layer in " - "a multi-layer fast clear."); - } + } else if (att_state->fast_clear && cmd_state->framebuffer->layers > 1) { + anv_perf_warn(device->instance, iview->image, + "Rendering to a multi-layer framebuffer with " + "LOAD_OP_CLEAR. Only fast-clearing the first slice"); } - /* We only allow fast clears in the GENERAL layout if the auxiliary - * buffer is always enabled and the fast-clear value is all 0's. See - * add_fast_clear_state_buffer() for more information. 
- */ - if (cmd_state->pass->attachments[att].first_subpass_layout == - VK_IMAGE_LAYOUT_GENERAL && - (!att_state->clear_color_is_zero || - iview->image->planes[0].aux_usage == ISL_AUX_USAGE_NONE)) { - att_state->fast_clear = false; - } - - if (att_state->fast_clear) { - memcpy(fast_clear_color->u32, att_state->clear_value.color.uint32, - sizeof(fast_clear_color->u32)); - } + if (att_state->fast_clear) + *fast_clear_color = clear_color; } else { att_state->fast_clear = false; } } +static void +depth_stencil_attachment_compute_aux_usage(struct anv_device *device, + struct anv_cmd_state *cmd_state, + uint32_t att, VkRect2D render_area) +{ + struct anv_render_pass_attachment *pass_att = + &cmd_state->pass->attachments[att]; + struct anv_attachment_state *att_state = &cmd_state->attachments[att]; + struct anv_image_view *iview = cmd_state->framebuffer->attachments[att]; + + /* These will be initialized after the first subpass transition. */ + att_state->aux_usage = ISL_AUX_USAGE_NONE; + att_state->input_aux_usage = ISL_AUX_USAGE_NONE; + + if (GEN_GEN == 7) { + /* We don't do any HiZ or depth fast-clears on gen7 yet */ + att_state->fast_clear = false; + return; + } + + if (!(att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { + /* If we're just clearing stencil, we can always HiZ clear */ + att_state->fast_clear = true; + return; + } + + /* Default to false for now */ + att_state->fast_clear = false; + + /* We must have depth in order to have HiZ */ + if (!(iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) + return; + + const enum isl_aux_usage first_subpass_aux_usage = + anv_layout_to_aux_usage(&device->info, iview->image, + VK_IMAGE_ASPECT_DEPTH_BIT, + pass_att->first_subpass_layout); + if (first_subpass_aux_usage != ISL_AUX_USAGE_HIZ) + return; + + if (!blorp_can_hiz_clear_depth(GEN_GEN, + iview->planes[0].isl.format, + iview->image->samples, + render_area.offset.x, + render_area.offset.y, + render_area.offset.x + + render_area.extent.width, + 
render_area.offset.y + + render_area.extent.height)) + return; + + if (att_state->clear_value.depthStencil.depth != ANV_HZ_FC_VAL) + return; + + if (GEN_GEN == 8 && anv_can_sample_with_hiz(&device->info, iview->image)) { + /* Only gen9+ supports returning ANV_HZ_FC_VAL when sampling a + * fast-cleared portion of a HiZ buffer. Testing has revealed that Gen8 + * only supports returning 0.0f. Gens prior to gen8 do not support this + * feature at all. + */ + return; + } + + /* If we got here, then we can fast clear */ + att_state->fast_clear = true; +} + static bool need_input_attachment_state(const struct anv_render_pass_attachment *att) { @@ -369,18 +449,6 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, VkImageLayout initial_layout, VkImageLayout final_layout) { - assert(image); - - /* A transition is a no-op if HiZ is not enabled, or if the initial and - * final layouts are equal. - * - * The undefined layout indicates that the user doesn't care about the data - * that's currently in the buffer. Therefore, a data-preserving resolve - * operation is not needed. - */ - if (image->planes[0].aux_usage != ISL_AUX_USAGE_HIZ || initial_layout == final_layout) - return; - const bool hiz_enabled = ISL_AUX_USAGE_HIZ == anv_layout_to_aux_usage(&cmd_buffer->device->info, image, VK_IMAGE_ASPECT_DEPTH_BIT, initial_layout); @@ -388,98 +456,289 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, anv_layout_to_aux_usage(&cmd_buffer->device->info, image, VK_IMAGE_ASPECT_DEPTH_BIT, final_layout); - enum blorp_hiz_op hiz_op; + enum isl_aux_op hiz_op; if (hiz_enabled && !enable_hiz) { - hiz_op = BLORP_HIZ_OP_DEPTH_RESOLVE; + hiz_op = ISL_AUX_OP_FULL_RESOLVE; } else if (!hiz_enabled && enable_hiz) { - hiz_op = BLORP_HIZ_OP_HIZ_RESOLVE; + hiz_op = ISL_AUX_OP_AMBIGUATE; } else { assert(hiz_enabled == enable_hiz); /* If the same buffer will be used, no resolves are necessary. 
*/ - hiz_op = BLORP_HIZ_OP_NONE; + hiz_op = ISL_AUX_OP_NONE; } - if (hiz_op != BLORP_HIZ_OP_NONE) - anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op); + if (hiz_op != ISL_AUX_OP_NONE) + anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, 0, 1, hiz_op); } #define MI_PREDICATE_SRC0 0x2400 #define MI_PREDICATE_SRC1 0x2408 -/* Manages the state of an color image subresource to ensure resolves are - * performed properly. - */ static void -genX(set_image_needs_resolve)(struct anv_cmd_buffer *cmd_buffer, - const struct anv_image *image, - VkImageAspectFlagBits aspect, - unsigned level, bool needs_resolve) +set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + bool compressed) { - assert(cmd_buffer && image); - assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); - assert(level < anv_image_aux_levels(image, aspect)); + uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); - /* The HW docs say that there is no way to guarantee the completion of - * the following command. We use it nevertheless because it shows no - * issues in testing is currently being used in the GL driver. - */ + /* We only have compression tracking for CCS_E */ + if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E) + return; + + for (uint32_t a = 0; a < layer_count; a++) { + uint32_t layer = base_layer + a; + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device, + image, aspect, + level, layer); + sdi.ImmediateData = compressed ? 
UINT32_MAX : 0; + } + } +} + +static void +set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum anv_fast_clear_type fast_clear) +{ anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { - sdi.Address = anv_image_get_needs_resolve_addr(cmd_buffer->device, - image, aspect, level); - sdi.ImmediateData = needs_resolve; + sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect); + sdi.ImmediateData = fast_clear; } + + /* Whenever we have fast-clear, we consider that slice to be compressed. + * This makes building predicates much easier. + */ + if (fast_clear != ANV_FAST_CLEAR_NONE) + set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true); +} + +#if GEN_IS_HASWELL || GEN_GEN >= 8 +static inline uint32_t +mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2) +{ + struct GENX(MI_MATH_ALU_INSTRUCTION) instr = { + .ALUOpcode = opcode, + .Operand1 = operand1, + .Operand2 = operand2, + }; + + uint32_t dw; + GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr); + + return dw; } +#endif + +#define CS_GPR(n) (0x2600 + (n) * 8) static void -genX(load_needs_resolve_predicate)(struct anv_cmd_buffer *cmd_buffer, - const struct anv_image *image, - VkImageAspectFlagBits aspect, - unsigned level) +anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) { - assert(cmd_buffer && image); - assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); - assert(level < anv_image_aux_levels(image, aspect)); + const uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); + struct anv_address fast_clear_type_addr = + anv_image_get_fast_clear_type_addr(cmd_buffer->device, image, aspect); + +#if GEN_GEN >= 9 + /* Name some registers */ + const int 
image_fc_reg = MI_ALU_REG0; + const int fc_imm_reg = MI_ALU_REG1; + const int pred_reg = MI_ALU_REG2; + + uint32_t *dw; + + if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) { + /* In this case, we're doing a full resolve which means we want the + * resolve to happen if any compression (including fast-clears) is + * present. + * + * In order to simplify the logic a bit, we make the assumption that, + * if the first slice has been fast-cleared, it is also marked as + * compressed. See also set_image_fast_clear_state. + */ + struct anv_address compression_state_addr = + anv_image_get_compression_state_addr(cmd_buffer->device, image, + aspect, level, array_layer); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = MI_PREDICATE_SRC0; + lrm.MemoryAddress = compression_state_addr; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = compression_state_addr; + sdi.ImmediateData = 0; + } + + if (level == 0 && array_layer == 0) { + /* If the predicate is true, we want to write 0 to the fast clear type + * and, if it's false, leave it alone. 
We can do this by writing + * + * clear_type = clear_type & ~predicate; + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = CS_GPR(image_fc_reg); + lrm.MemoryAddress = fast_clear_type_addr; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_REG), lrr) { + lrr.DestinationRegisterAddress = CS_GPR(pred_reg); + lrr.SourceRegisterAddress = MI_PREDICATE_SRC0; + } + + dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); + dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, image_fc_reg); + dw[2] = mi_alu(MI_ALU_LOADINV, MI_ALU_SRCB, pred_reg); + dw[3] = mi_alu(MI_ALU_AND, 0, 0); + dw[4] = mi_alu(MI_ALU_STORE, image_fc_reg, MI_ALU_ACCU); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { + srm.MemoryAddress = fast_clear_type_addr; + srm.RegisterAddress = CS_GPR(image_fc_reg); + } + } + } else if (level == 0 && array_layer == 0) { + /* In this case, we are doing a partial resolve to get rid of fast-clear + * colors. We don't care about the compression state but we do care + * about how much fast clear is allowed by the final layout. + */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + assert(fast_clear_supported < ANV_FAST_CLEAR_ANY); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = CS_GPR(image_fc_reg); + lrm.MemoryAddress = fast_clear_type_addr; + } + emit_lri(&cmd_buffer->batch, CS_GPR(image_fc_reg) + 4, 0); + + emit_lri(&cmd_buffer->batch, CS_GPR(fc_imm_reg), fast_clear_supported); + emit_lri(&cmd_buffer->batch, CS_GPR(fc_imm_reg) + 4, 0); + + /* We need to compute (fast_clear_supported < image->fast_clear). + * We do this by subtracting and storing the carry bit. 
+ */ + dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); + dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, fc_imm_reg); + dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, image_fc_reg); + dw[3] = mi_alu(MI_ALU_SUB, 0, 0); + dw[4] = mi_alu(MI_ALU_STORE, pred_reg, MI_ALU_CF); + + /* Store the predicate */ + emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(pred_reg)); + + /* If the predicate is true, we want to write 0 to the fast clear type + * and, if it's false, leave it alone. We can do this by writing + * + * clear_type = clear_type & ~predicate; + */ + dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); + dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, image_fc_reg); + dw[2] = mi_alu(MI_ALU_LOADINV, MI_ALU_SRCB, pred_reg); + dw[3] = mi_alu(MI_ALU_AND, 0, 0); + dw[4] = mi_alu(MI_ALU_STORE, image_fc_reg, MI_ALU_ACCU); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { + srm.RegisterAddress = CS_GPR(image_fc_reg); + srm.MemoryAddress = fast_clear_type_addr; + } + } else { + /* In this case, we're trying to do a partial resolve on a slice that + * doesn't have clear color. There's nothing to do. + */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + return; + } + +#else /* GEN_GEN <= 8 */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + assert(fast_clear_supported != ANV_FAST_CLEAR_ANY); + + /* We don't support fast clears on anything other than the first slice. */ + if (level > 0 || array_layer > 0) + return; - const struct anv_address resolve_flag_addr = - anv_image_get_needs_resolve_addr(cmd_buffer->device, - image, aspect, level); + /* On gen8, we don't have a concept of default clear colors because we + * can't sample from CCS surfaces. It's enough to just load the fast clear + * state into the predicate register. 
+ */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = MI_PREDICATE_SRC0; + lrm.MemoryAddress = fast_clear_type_addr; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = fast_clear_type_addr; + sdi.ImmediateData = 0; + } +#endif - /* Make the pending predicated resolve a no-op if one is not needed. - * predicate = do_resolve = resolve_flag != 0; + /* We use the first half of src0 for the actual predicate. Set the second + * half of src0 and all of src1 to 0 as the predicate operation will be + * doing an implicit src0 != src1. */ + emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0); emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 , 0); emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0); - emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 , 0); - emit_lrm(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, - resolve_flag_addr.bo, resolve_flag_addr.offset); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { mip.LoadOperation = LOAD_LOADINV; mip.CombineOperation = COMBINE_SET; mip.CompareOperation = COMPARE_SRCS_EQUAL; } + + /* CCS_D only supports full resolves and BLORP will assert on us if we try + * to do a partial resolve on a CCS_D surface. + */ + if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && + image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + resolve_op = ISL_AUX_OP_FULL_RESOLVE; + + anv_image_ccs_op(cmd_buffer, image, aspect, level, + array_layer, 1, resolve_op, true); +} + +void +genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count) +{ + /* The aspect must be exactly one of the image aspects. */ + assert(_mesa_bitcount(aspect) == 1 && (aspect & image->aspects)); + + /* The only compression types with more than just fast-clears are MCS, + * CCS_E, and HiZ. 
With HiZ we just trust the layout and don't actually + * track the current fast-clear and compression state. This leaves us + * with just MCS and CCS_E. + */ + if (aux_usage != ISL_AUX_USAGE_CCS_E && + aux_usage != ISL_AUX_USAGE_MCS) + return; + + set_image_compressed_bit(cmd_buffer, image, aspect, + level, base_layer, layer_count, true); } static void -init_fast_clear_state_entry(struct anv_cmd_buffer *cmd_buffer, - const struct anv_image *image, - VkImageAspectFlagBits aspect, - unsigned level) +init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect) { assert(cmd_buffer && image); assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); - assert(level < anv_image_aux_levels(image, aspect)); - uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); - enum isl_aux_usage aux_usage = image->planes[plane].aux_usage; - - /* The resolve flag should updated to signify that fast-clear/compression - * data needs to be removed when leaving the undefined layout. Such data - * may need to be removed if it would cause accesses to the color buffer - * to return incorrect data. The fast clear data in CCS_D buffers should - * be removed because CCS_D isn't enabled all the time. - */ - genX(set_image_needs_resolve)(cmd_buffer, image, aspect, level, - aux_usage == ISL_AUX_USAGE_NONE); + set_image_fast_clear_state(cmd_buffer, image, aspect, + ANV_FAST_CLEAR_NONE); /* The fast clear value dword(s) will be copied into a surface state object. * Ensure that the restrictions of the fields in the dword(s) are followed. @@ -493,7 +752,7 @@ init_fast_clear_state_entry(struct anv_cmd_buffer *cmd_buffer, * values in the clear value dword(s). 
*/ struct anv_address addr = - anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect, level); + anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); unsigned i = 0; for (; i < cmd_buffer->device->isl_dev.ss.clear_value_size; i += 4) { anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { @@ -501,7 +760,7 @@ init_fast_clear_state_entry(struct anv_cmd_buffer *cmd_buffer, if (GEN_GEN >= 9) { /* MCS buffers on SKL+ can only have 1/0 clear colors. */ - assert(aux_usage == ISL_AUX_USAGE_MCS); + assert(image->samples > 1); sdi.ImmediateData = 0; } else if (GEN_VERSIONx10 >= 75) { /* Pre-SKL, the dword containing the clear values also contains @@ -534,19 +793,17 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, struct anv_state surface_state, const struct anv_image *image, VkImageAspectFlagBits aspect, - unsigned level, bool copy_from_surface_state) { assert(cmd_buffer && image); assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); - assert(level < anv_image_aux_levels(image, aspect)); struct anv_bo *ss_bo = &cmd_buffer->device->surface_state_pool.block_pool.bo; uint32_t ss_clear_offset = surface_state.offset + cmd_buffer->device->isl_dev.ss.clear_value_offset; const struct anv_address entry_addr = - anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect, level); + anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; if (copy_from_surface_state) { @@ -592,6 +849,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, VkImageLayout initial_layout, VkImageLayout final_layout) { + const struct gen_device_info *devinfo = &cmd_buffer->device->info; /* Validate the inputs. 
*/ assert(cmd_buffer); assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); @@ -635,102 +893,128 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, if (base_layer >= anv_image_aux_layers(image, aspect, base_level)) return; - /* A transition of a 3D subresource works on all slices at a time. */ - if (image->type == VK_IMAGE_TYPE_3D) { - base_layer = 0; - layer_count = anv_minify(image->extent.depth, base_level); - } - - /* We're interested in the subresource range subset that has aux data. */ - level_count = MIN2(level_count, anv_image_aux_levels(image, aspect) - base_level); - layer_count = MIN2(layer_count, - anv_image_aux_layers(image, aspect, base_level) - base_layer); - last_level_num = base_level + level_count; - - /* Record whether or not the layout is undefined. Pre-initialized images - * with auxiliary buffers have a non-linear layout and are thus undefined. - */ assert(image->tiling == VK_IMAGE_TILING_OPTIMAL); - const bool undef_layout = initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || - initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED; - /* Do preparatory work before the resolve operation or return early if no - * resolve is actually needed. - */ - if (undef_layout) { + if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || + initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) { /* A subresource in the undefined layout may have been aliased and * populated with any arrangement of bits. Therefore, we must initialize * the related aux buffer and clear buffer entry with desirable values. + * An initial layout of PREINITIALIZED is the same as UNDEFINED for + * images with VK_IMAGE_TILING_OPTIMAL. * * Initialize the relevant clear buffer entries. */ - for (unsigned level = base_level; level < last_level_num; level++) - init_fast_clear_state_entry(cmd_buffer, image, aspect, level); + if (base_level == 0 && base_layer == 0) + init_fast_clear_color(cmd_buffer, image, aspect); - /* Initialize the aux buffers to enable correct rendering. 
This operation - * requires up to two steps: one to rid the aux buffer of data that may - * cause GPU hangs, and another to ensure that writes done without aux - * will be visible to reads done with aux. + /* Initialize the aux buffers to enable correct rendering. In order to + * ensure that things such as storage images work correctly, aux buffers + * need to be initialized to valid data. + * + * Having an aux buffer with invalid data is a problem for two reasons: + * + * 1) Having an invalid value in the buffer can confuse the hardware. + * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is + * invalid and leads to the hardware doing strange things. It + * doesn't hang as far as we can tell but rendering corruption can + * occur. * - * Having an aux buffer with invalid data is possible for CCS buffers - * SKL+ and for MCS buffers with certain sample counts (2x and 8x). One - * easy way to get to a valid state is to fast-clear the specified range. + * 2) If this transition is into the GENERAL layout and we then use the + * image as a storage image, then we must have the aux buffer in the + * pass-through state so that, if we then go to texture from the + * image, we get the results of our storage image writes and not the + * fast clear color or other random data. * - * Even for MCS buffers that have sample counts that don't require - * certain bits to be reserved (4x and 8x), we're unsure if the hardware - * will be okay with the sample mappings given by the undefined buffer. - * We don't have any data to show that this is a problem, but we want to - * avoid causing difficult-to-debug problems. + * For CCS both of the problems above are real demonstrable issues. In + * that case, the only thing we can do is to perform an ambiguate to + * transition the aux surface into the pass-through state. + * + * For MCS, (2) is never an issue because we don't support multisampled + * storage images. 
In theory, issue (1) is a problem with MCS but we've + * never seen it in the wild. For 4x and 16x, all bit patters could, in + * theory, be interpreted as something but we don't know that all bit + * patterns are actually valid. For 2x and 8x, you could easily end up + * with the MCS referring to an invalid plane because not all bits of + * the MCS value are actually used. Even though we've never seen issues + * in the wild, it's best to play it safe and initialize the MCS. We + * can use a fast-clear for MCS because we only ever touch from render + * and texture (no image load store). */ - if ((GEN_GEN >= 9 && image->samples == 1) || image->samples > 1) { + if (image->samples == 1) { + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + uint32_t aux_layers = anv_image_aux_layers(image, aspect, level); + if (base_layer >= aux_layers) + break; /* We will only get fewer layers as level increases */ + uint32_t level_layer_count = + MIN2(layer_count, aux_layers - base_layer); + + anv_image_ccs_op(cmd_buffer, image, aspect, level, + base_layer, level_layer_count, + ISL_AUX_OP_AMBIGUATE, false); + + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { + set_image_compressed_bit(cmd_buffer, image, aspect, + level, base_layer, level_layer_count, + false); + } + } + } else { if (image->samples == 4 || image->samples == 16) { anv_perf_warn(cmd_buffer->device->instance, image, "Doing a potentially unnecessary fast-clear to " "define an MCS buffer."); } - anv_image_fast_clear(cmd_buffer, image, aspect, - base_level, level_count, - base_layer, layer_count); + assert(base_level == 0 && level_count == 1); + anv_image_mcs_op(cmd_buffer, image, aspect, + base_layer, layer_count, + ISL_AUX_OP_FAST_CLEAR, false); } - /* At this point, some elements of the CCS buffer may have the fast-clear - * bit-arrangement. As the user writes to a subresource, we need to have - * the associated CCS elements enter the ambiguated state. 
This enables - * reads (implicit or explicit) to reflect the user-written data instead - * of the clear color. The only time such elements will not change their - * state as described above, is in a final layout that doesn't have CCS - * enabled. In this case, we must force the associated CCS buffers of the - * specified range to enter the ambiguated state in advance. - */ - if (image->samples == 1 && - image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E && - final_layout != VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { - /* The CCS_D buffer may not be enabled in the final layout. Continue - * executing this function to perform a resolve. - */ - anv_perf_warn(cmd_buffer->device->instance, image, - "Performing an additional resolve for CCS_D layout " - "transition. Consider always leaving it on or " - "performing an ambiguation pass."); - } else { - /* Writes in the final layout will be aware of the auxiliary buffer. - * In addition, the clear buffer entries and the auxiliary buffers - * have been populated with values that will result in correct - * rendering. - */ - return; - } - } else if (initial_layout != VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { - /* Resolves are only necessary if the subresource may contain blocks - * fast-cleared to values unsupported in other layouts. This only occurs - * if the initial layout is COLOR_ATTACHMENT_OPTIMAL. - */ - return; - } else if (image->samples > 1) { - /* MCS buffers don't need resolving. */ return; } + const enum isl_aux_usage initial_aux_usage = + anv_layout_to_aux_usage(devinfo, image, aspect, initial_layout); + const enum isl_aux_usage final_aux_usage = + anv_layout_to_aux_usage(devinfo, image, aspect, final_layout); + + /* The current code assumes that there is no mixing of CCS_E and CCS_D. + * We can handle transitions between CCS_D/E to and from NONE. What we + * don't yet handle is switching between CCS_E and CCS_D within a given + * image. 
Doing so in a performant way requires more detailed aux state + * tracking such as what is done in i965. For now, just assume that we + * only have one type of compression. + */ + assert(initial_aux_usage == ISL_AUX_USAGE_NONE || + final_aux_usage == ISL_AUX_USAGE_NONE || + initial_aux_usage == final_aux_usage); + + /* If initial aux usage is NONE, there is nothing to resolve */ + if (initial_aux_usage == ISL_AUX_USAGE_NONE) + return; + + enum isl_aux_op resolve_op = ISL_AUX_OP_NONE; + + /* If the initial layout supports more fast clear than the final layout + * then we need at least a partial resolve. + */ + const enum anv_fast_clear_type initial_fast_clear = + anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout); + const enum anv_fast_clear_type final_fast_clear = + anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout); + if (final_fast_clear < initial_fast_clear) + resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE; + + if (initial_aux_usage == ISL_AUX_USAGE_CCS_E && + final_aux_usage != ISL_AUX_USAGE_CCS_E) + resolve_op = ISL_AUX_OP_FULL_RESOLVE; + + if (resolve_op == ISL_AUX_OP_NONE) + return; + /* Perform a resolve to synchronize data between the main and aux buffer. * Before we begin, we must satisfy the cache flushing requirement specified * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)": @@ -751,21 +1035,21 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; - for (uint32_t level = base_level; level < last_level_num; level++) { - - /* The number of layers changes at each 3D miplevel. 
*/ - if (image->type == VK_IMAGE_TYPE_3D) { - layer_count = MIN2(layer_count, anv_image_aux_layers(image, aspect, level)); - } - - genX(load_needs_resolve_predicate)(cmd_buffer, image, aspect, level); + for (uint32_t l = 0; l < level_count; l++) { + uint32_t level = base_level + l; - anv_ccs_resolve(cmd_buffer, image, aspect, level, base_layer, layer_count, - image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E ? - BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL : - BLORP_FAST_CLEAR_OP_RESOLVE_FULL); + uint32_t aux_layers = anv_image_aux_layers(image, aspect, level); + if (base_layer >= aux_layers) + break; /* We will only get fewer layers as level increases */ + uint32_t level_layer_count = + MIN2(layer_count, aux_layers - base_layer); - genX(set_image_needs_resolve)(cmd_buffer, image, aspect, level, false); + for (uint32_t a = 0; a < level_layer_count; a++) { + uint32_t array_layer = base_layer + a; + anv_cmd_predicated_ccs_resolve(cmd_buffer, image, aspect, + level, array_layer, resolve_op, + final_fast_clear); + } } cmd_buffer->state.pending_pipe_bits |= @@ -850,26 +1134,36 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, struct anv_render_pass_attachment *att = &pass->attachments[i]; VkImageAspectFlags att_aspects = vk_format_aspects(att->format); VkImageAspectFlags clear_aspects = 0; + VkImageAspectFlags load_aspects = 0; if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { /* color attachment */ if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; + } else if (att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { + load_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; } } else { /* depthstencil attachment */ - if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && - att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + } else if 
(att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { + load_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + } } - if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && - att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + if (att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + } else if (att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { + load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + } } } state->attachments[i].current_layout = att->initial_layout; state->attachments[i].pending_clear_aspects = clear_aspects; + state->attachments[i].pending_load_aspects = load_aspects; if (clear_aspects) state->attachments[i].clear_value = begin->pClearValues[i]; @@ -898,12 +1192,9 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, add_image_view_relocs(cmd_buffer, iview, 0, state->attachments[i].color); } else { - /* This field will be initialized after the first subpass - * transition. - */ - state->attachments[i].aux_usage = ISL_AUX_USAGE_NONE; - - state->attachments[i].input_aux_usage = ISL_AUX_USAGE_NONE; + depth_stencil_attachment_compute_aux_usage(cmd_buffer->device, + state, i, + begin->renderArea); } if (need_input_attachment_state(&pass->attachments[i])) { @@ -975,8 +1266,7 @@ genX(BeginCommandBuffer)( * emit push constants again before any rendering operation. So we * flag them dirty here to make sure they get emitted. 
*/ - if (GEN_GEN == 10) - cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; VkResult result = VK_SUCCESS; if (cmd_buffer->usage_flags & @@ -1074,8 +1364,7 @@ genX(EndCommandBuffer)( genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); - if (GEN_GEN == 10) - emit_isp_disable(cmd_buffer); + emit_isp_disable(cmd_buffer); anv_cmd_buffer_end_batch_buffer(cmd_buffer); @@ -1431,12 +1720,20 @@ void genX(CmdPipelineBarrier)( anv_image_expand_aspects(image, range->aspectMask); uint32_t aspect_bit; + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->extent.depth, range->baseMipLevel); + } else { + base_layer = range->baseArrayLayer; + layer_count = anv_get_layerCount(image, range); + } + anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) { transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit, range->baseMipLevel, anv_get_levelCount(image, range), - range->baseArrayLayer, - anv_get_layerCount(image, range), + base_layer, layer_count, pImageMemoryBarriers[i].oldLayout, pImageMemoryBarriers[i].newLayout); } @@ -1772,6 +2069,26 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, out: anv_state_flush(cmd_buffer->device, *bt_state); +#if GEN_GEN >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Target Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." + * + * FINISHME: Currently we shuffle around the surface states in the binding + * table based on if they are getting used or not. So, we've to do below + * pipe control flush for every binding table upload.
Make changes so + * that we do it only when we modify render target surface states. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.RenderTargetCacheFlushEnable = true; + pc.StallAtPixelScoreboard = true; + } +#endif + return VK_SUCCESS; } @@ -2344,23 +2661,6 @@ void genX(CmdDrawIndexed)( /* MI_MATH only exists on Haswell+ */ #if GEN_IS_HASWELL || GEN_GEN >= 8 -static uint32_t -mi_alu(uint32_t opcode, uint32_t op1, uint32_t op2) -{ - struct GENX(MI_MATH_ALU_INSTRUCTION) instr = { - .ALUOpcode = opcode, - .Operand1 = op1, - .Operand2 = op2, - }; - - uint32_t dw; - GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr); - - return dw; -} - -#define CS_GPR(n) (0x2600 + (n) * 8) - /* Emit dwords to multiply GPR0 by N */ static void build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N) @@ -2988,79 +3288,54 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.hiz_enabled = info.hiz_usage == ISL_AUX_USAGE_HIZ; } - -/** - * @brief Perform any layout transitions required at the beginning and/or end - * of the current subpass for depth buffers. - * - * TODO: Consider preprocessing the attachment reference array at render pass - * create time to determine if no layout transition is needed at the - * beginning and/or end of each subpass. - * - * @param cmd_buffer The command buffer the transition is happening within. - * @param subpass_end If true, marks that the transition is happening at the - * end of the subpass. - */ static void -cmd_buffer_subpass_transition_layouts(struct anv_cmd_buffer * const cmd_buffer, - const bool subpass_end) +cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, + uint32_t subpass_id) { - /* We need a non-NULL command buffer. 
*/ - assert(cmd_buffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_subpass *subpass = &cmd_state->pass->subpasses[subpass_id]; + cmd_state->subpass = subpass; - const struct anv_cmd_state * const cmd_state = &cmd_buffer->state; - const struct anv_subpass * const subpass = cmd_state->subpass; - - /* This function must be called within a subpass. */ - assert(subpass); + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; - /* If there are attachment references, the array shouldn't be NULL. + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. If the client asks for instancing, we need to use the + * Instance Data Step Rate to ensure that we repeat the client's + * per-instance data once for each view. Since this bit is in + * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top + * of each subpass. */ - if (subpass->attachment_count > 0) - assert(subpass->attachments); - - /* Iterate over the array of attachment references. */ - for (const VkAttachmentReference *att_ref = subpass->attachments; - att_ref < subpass->attachments + subpass->attachment_count; att_ref++) { - - /* If the attachment is unused, we can't perform a layout transition. */ - if (att_ref->attachment == VK_ATTACHMENT_UNUSED) - continue; + if (GEN_GEN == 7) + cmd_buffer->state.gfx.vb_dirty |= ~0; - /* This attachment index shouldn't go out of bounds. */ - assert(att_ref->attachment < cmd_state->pass->attachment_count); + /* It is possible to start a render pass with an old pipeline. Because the + * render pass and subpass index are both baked into the pipeline, this is + * highly unlikely. In order to do so, it requires that you have a render + * pass with a single subpass and that you use that render pass twice + * back-to-back and use the same pipeline at the start of the second render + * pass as at the end of the first. 
In order to avoid unpredictable issues + * with this edge case, we just dirty the pipeline at the start of every + * subpass. + */ + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; - const struct anv_render_pass_attachment * const att_desc = - &cmd_state->pass->attachments[att_ref->attachment]; - struct anv_attachment_state * const att_state = - &cmd_buffer->state.attachments[att_ref->attachment]; + /* Accumulate any subpass flushes that need to happen before the subpass */ + cmd_buffer->state.pending_pipe_bits |= + cmd_buffer->state.pass->subpass_flushes[subpass_id]; - /* The attachment should not be used in a subpass after its last. */ - assert(att_desc->last_subpass_idx >= anv_get_subpass_id(cmd_state)); + VkRect2D render_area = cmd_buffer->state.render_area; + struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; - if (subpass_end && anv_get_subpass_id(cmd_state) < - att_desc->last_subpass_idx) { - /* We're calling this function on a buffer twice in one subpass and - * this is not the last use of the buffer. The layout should not have - * changed from the first call and no transition is necessary. - */ - assert(att_state->current_layout == att_ref->layout || - att_state->current_layout == - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); + for (uint32_t i = 0; i < subpass->attachment_count; ++i) { + const uint32_t a = subpass->attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) continue; - } - /* The attachment index must be less than the number of attachments - * within the framebuffer. - */ - assert(att_ref->attachment < cmd_state->framebuffer->attachment_count); + assert(a < cmd_state->pass->attachment_count); + struct anv_attachment_state *att_state = &cmd_state->attachments[a]; - const struct anv_image_view * const iview = - cmd_state->framebuffer->attachments[att_ref->attachment]; - const struct anv_image * const image = iview->image; - - /* Get the appropriate target layout for this attachment. 
*/ - VkImageLayout target_layout; + struct anv_image_view *iview = fb->attachments[a]; + const struct anv_image *image = iview->image; /* A resolve is necessary before use as an input attachment if the clear * color or auxiliary buffer usage isn't supported by the sampler. @@ -3068,161 +3343,252 @@ cmd_buffer_subpass_transition_layouts(struct anv_cmd_buffer * const cmd_buffer, const bool input_needs_resolve = (att_state->fast_clear && !att_state->clear_color_is_zero_one) || att_state->input_aux_usage != att_state->aux_usage; - if (subpass_end) { - target_layout = att_desc->final_layout; - } else if (iview->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV && - !input_needs_resolve) { - /* Layout transitions before the final only help to enable sampling as - * an input attachment. If the input attachment supports sampling - * using the auxiliary surface, we can skip such transitions by making - * the target layout one that is CCS-aware. + + VkImageLayout target_layout; + if (iview->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV && + !input_needs_resolve) { + /* Layout transitions before the final only help to enable sampling + * as an input attachment. If the input attachment supports sampling + * using the auxiliary surface, we can skip such transitions by + * making the target layout one that is CCS-aware. */ target_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; } else { - target_layout = att_ref->layout; + target_layout = subpass->attachments[i].layout; } - /* Perform the layout transition. 
*/ - if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); + + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(iview->image->extent.depth, + iview->planes[0].isl.base_level); + } else { + base_layer = iview->planes[0].isl.base_array_layer; + layer_count = fb->layers; + } + + transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, + iview->planes[0].isl.base_level, 1, + base_layer, layer_count, + att_state->current_layout, target_layout); + } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, att_state->current_layout, target_layout); att_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, image, VK_IMAGE_ASPECT_DEPTH_BIT, target_layout); - } else if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { - assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); - transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, - iview->planes[0].isl.base_level, 1, - iview->planes[0].isl.base_array_layer, - iview->planes[0].isl.array_len, - att_state->current_layout, target_layout); } - att_state->current_layout = target_layout; - } -} - -/* Update the clear value dword(s) in surface state objects or the fast clear - * state buffer entry for the color attachments used in this subpass. - */ -static void -cmd_buffer_subpass_sync_fast_clear_values(struct anv_cmd_buffer *cmd_buffer) -{ - assert(cmd_buffer && cmd_buffer->state.subpass); - - const struct anv_cmd_state *state = &cmd_buffer->state; - - /* Iterate through every color attachment used in this subpass. */ - for (uint32_t i = 0; i < state->subpass->color_count; ++i) { - - /* The attachment should be one of the attachments described in the - * render pass and used in the subpass. 
- */ - const uint32_t a = state->subpass->color_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - assert(a < state->pass->attachment_count); + if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); - /* Store some information regarding this attachment. */ - const struct anv_attachment_state *att_state = &state->attachments[a]; - const struct anv_image_view *iview = state->framebuffer->attachments[a]; - const struct anv_render_pass_attachment *rp_att = - &state->pass->attachments[a]; + /* Multi-planar images are not supported as attachments */ + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); + assert(image->n_planes == 1); + + uint32_t base_clear_layer = iview->planes[0].isl.base_array_layer; + uint32_t clear_layer_count = fb->layers; + + if (att_state->fast_clear) { + /* We only support fast-clears on the first layer */ + assert(iview->planes[0].isl.base_level == 0); + assert(iview->planes[0].isl.base_array_layer == 0); + + anv_image_ccs_op(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, + 0, 0, 1, ISL_AUX_OP_FAST_CLEAR, false); + base_clear_layer++; + clear_layer_count--; + + genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color.state, + image, VK_IMAGE_ASPECT_COLOR_BIT, + true /* copy from ss */); + + if (att_state->clear_color_is_zero) { + /* This image has the auxiliary buffer enabled. We can mark the + * subresource as not needing a resolve because the clear color + * will match what's in every RENDER_SURFACE_STATE object when + * it's being used for sampling. 
+ */ + set_image_fast_clear_state(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + ANV_FAST_CLEAR_DEFAULT_VALUE); + } else { + set_image_fast_clear_state(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + ANV_FAST_CLEAR_ANY); + } + } - if (att_state->aux_usage == ISL_AUX_USAGE_NONE) - continue; + if (clear_layer_count > 0) { + assert(image->n_planes == 1); + anv_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, + att_state->aux_usage, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + iview->planes[0].isl.base_level, + base_clear_layer, clear_layer_count, + render_area, + vk_to_isl_color(att_state->clear_value.color)); + } + } else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)) { + if (att_state->fast_clear) { + /* We currently only support HiZ for single-layer images */ + if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + assert(iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ); + assert(iview->planes[0].isl.base_level == 0); + assert(iview->planes[0].isl.base_array_layer == 0); + assert(fb->layers == 1); + } - /* The fast clear state entry must be updated if a fast clear is going to - * happen. The surface state must be updated if the clear value from a - * prior fast clear may be needed. - */ - if (att_state->pending_clear_aspects && att_state->fast_clear) { - /* Update the fast clear state entry. */ - genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color.state, - iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, - iview->planes[0].isl.base_level, - true /* copy from ss */); - - /* Fast-clears impact whether or not a resolve will be necessary. */ - if (iview->image->planes[0].aux_usage == ISL_AUX_USAGE_CCS_E && - att_state->clear_color_is_zero) { - /* This image always has the auxiliary buffer enabled. 
We can mark - * the subresource as not needing a resolve because the clear color - * will match what's in every RENDER_SURFACE_STATE object when it's - * being used for sampling. - */ - genX(set_image_needs_resolve)(cmd_buffer, iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, - iview->planes[0].isl.base_level, - false); + anv_image_hiz_clear(cmd_buffer, image, + att_state->pending_clear_aspects, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, + fb->layers, render_area, + att_state->clear_value.depthStencil.stencil); } else { - genX(set_image_needs_resolve)(cmd_buffer, iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, + anv_image_clear_depth_stencil(cmd_buffer, image, + att_state->pending_clear_aspects, + att_state->aux_usage, iview->planes[0].isl.base_level, - true); + iview->planes[0].isl.base_array_layer, + fb->layers, render_area, + att_state->clear_value.depthStencil.depth, + att_state->clear_value.depthStencil.stencil); + } + } else { + assert(att_state->pending_clear_aspects == 0); + } + + if ((att_state->pending_load_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && + image->planes[0].aux_surface.isl.size > 0 && + iview->planes[0].isl.base_level == 0 && + iview->planes[0].isl.base_array_layer == 0) { + if (att_state->aux_usage != ISL_AUX_USAGE_NONE) { + genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color.state, + image, VK_IMAGE_ASPECT_COLOR_BIT, + false /* copy to ss */); } - } else if (rp_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { - /* The attachment may have been fast-cleared in a previous render - * pass and the value is needed now. Update the surface state(s). - * - * TODO: Do this only once per render pass instead of every subpass. 
- */ - genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color.state, - iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, - iview->planes[0].isl.base_level, - false /* copy to ss */); - if (need_input_attachment_state(rp_att) && + if (need_input_attachment_state(&cmd_state->pass->attachments[a]) && att_state->input_aux_usage != ISL_AUX_USAGE_NONE) { genX(copy_fast_clear_dwords)(cmd_buffer, att_state->input.state, - iview->image, - VK_IMAGE_ASPECT_COLOR_BIT, - iview->planes[0].isl.base_level, + image, VK_IMAGE_ASPECT_COLOR_BIT, false /* copy to ss */); } } + + if (subpass->attachments[i].usage == + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + /* We assume that if we're starting a subpass, we're going to do some + * rendering so we may end up with compressed data. + */ + genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + att_state->aux_usage, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, + fb->layers); + } else if (subpass->attachments[i].usage == + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { + /* We may be writing depth or stencil so we need to mark the surface. + * Unfortunately, there's no way to know at this point whether the + * depth or stencil tests used will actually write to the surface. + * + * Even though stencil may be plane 1, it always shares a base_level + * with depth. + */ + const struct isl_view *ds_view = &iview->planes[0].isl; + if (iview->aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) { + genX(cmd_buffer_mark_image_written)(cmd_buffer, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + att_state->aux_usage, + ds_view->base_level, + ds_view->base_array_layer, + fb->layers); + } + if (iview->aspect_mask & VK_IMAGE_ASPECT_STENCIL_BIT) { + /* Even though stencil may be plane 1, it always shares a + * base_level with depth. 
+ */ + genX(cmd_buffer_mark_image_written)(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + ISL_AUX_USAGE_NONE, + ds_view->base_level, + ds_view->base_array_layer, + fb->layers); + } + } + + att_state->pending_clear_aspects = 0; + att_state->pending_load_aspects = 0; } -} + cmd_buffer_emit_depth_stencil(cmd_buffer); +} static void -genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, - struct anv_subpass *subpass) +cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) { - cmd_buffer->state.subpass = subpass; + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_subpass *subpass = cmd_state->subpass; + uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state); - cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + anv_cmd_buffer_resolve_subpass(cmd_buffer); - /* Our implementation of VK_KHR_multiview uses instancing to draw the - * different views. If the client asks for instancing, we need to use the - * Instance Data Step Rate to ensure that we repeat the client's - * per-instance data once for each view. Since this bit is in - * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top - * of each subpass. - */ - if (GEN_GEN == 7) - cmd_buffer->state.gfx.vb_dirty |= ~0; + struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; + for (uint32_t i = 0; i < subpass->attachment_count; ++i) { + const uint32_t a = subpass->attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; - /* Perform transitions to the subpass layout before any writes have - * occurred. - */ - cmd_buffer_subpass_transition_layouts(cmd_buffer, false); - - /* Update clear values *after* performing automatic layout transitions. 
- * This ensures that transitions from the UNDEFINED layout have had a chance - * to populate the clear value buffer with the correct values for the - * LOAD_OP_LOAD loadOp and that the fast-clears will update the buffer - * without the aforementioned layout transition overwriting the fast-clear - * value. - */ - cmd_buffer_subpass_sync_fast_clear_values(cmd_buffer); + if (cmd_state->pass->attachments[a].last_subpass_idx != subpass_id) + continue; - cmd_buffer_emit_depth_stencil(cmd_buffer); + assert(a < cmd_state->pass->attachment_count); + struct anv_attachment_state *att_state = &cmd_state->attachments[a]; + struct anv_image_view *iview = fb->attachments[a]; + const struct anv_image *image = iview->image; + + /* Transition the image into the final layout for this render pass */ + VkImageLayout target_layout = + cmd_state->pass->attachments[a].final_layout; - anv_cmd_buffer_clear_subpass(cmd_buffer); + if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); + + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(iview->image->extent.depth, + iview->planes[0].isl.base_level); + } else { + base_layer = iview->planes[0].isl.base_array_layer; + layer_count = fb->layers; + } + + transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, + iview->planes[0].isl.base_level, 1, + base_layer, layer_count, + att_state->current_layout, target_layout); + } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + transition_depth_buffer(cmd_buffer, image, + att_state->current_layout, target_layout); + } + } + + /* Accumulate any subpass flushes that need to happen after the subpass. + * Yes, they do get accumulated twice in the NextSubpass case but since + * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up + * ORing the bits in twice so it's harmless. 
+ */ + cmd_buffer->state.pending_pipe_bits |= + cmd_buffer->state.pass->subpass_flushes[subpass_id + 1]; } void genX(CmdBeginRenderPass)( @@ -3248,10 +3614,7 @@ void genX(CmdBeginRenderPass)( genX(flush_pipeline_select_3d)(cmd_buffer); - genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses); - - cmd_buffer->state.pending_pipe_bits |= - cmd_buffer->state.pass->subpass_flushes[0]; + cmd_buffer_begin_subpass(cmd_buffer, 0); } void genX(CmdNextSubpass)( @@ -3265,17 +3628,9 @@ void genX(CmdNextSubpass)( assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); - anv_cmd_buffer_resolve_subpass(cmd_buffer); - - /* Perform transitions to the final layout after all writes have occurred. - */ - cmd_buffer_subpass_transition_layouts(cmd_buffer, true); - - genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1); - - uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state); - cmd_buffer->state.pending_pipe_bits |= - cmd_buffer->state.pass->subpass_flushes[subpass_id]; + uint32_t prev_subpass = anv_get_subpass_id(&cmd_buffer->state); + cmd_buffer_end_subpass(cmd_buffer); + cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); } void genX(CmdEndRenderPass)( @@ -3286,14 +3641,7 @@ void genX(CmdEndRenderPass)( if (anv_batch_has_error(&cmd_buffer->batch)) return; - anv_cmd_buffer_resolve_subpass(cmd_buffer); - - /* Perform transitions to the final layout after all writes have occurred. - */ - cmd_buffer_subpass_transition_layouts(cmd_buffer, true); - - cmd_buffer->state.pending_pipe_bits |= - cmd_buffer->state.pass->subpass_flushes[cmd_buffer->state.pass->subpass_count]; + cmd_buffer_end_subpass(cmd_buffer); cmd_buffer->state.hiz_enabled = false;