diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c
index 4b639e4a139..459d57ec575 100644
--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c
@@ -49,7 +49,7 @@ lookup_blorp_shader(struct blorp_context *blorp,
    return true;
 }
 
-static void
+static bool
 upload_blorp_shader(struct blorp_context *blorp,
                     const void *key, uint32_t key_size,
                     const void *kernel, uint32_t kernel_size,
@@ -72,6 +72,9 @@ upload_blorp_shader(struct blorp_context *blorp,
                              key, key_size, kernel, kernel_size,
                              prog_data, prog_data_size, &bind_map);
 
+   if (!bin)
+      return false;
+
    /* The cache already has a reference and it's not going anywhere so there
     * is no need to hold a second reference.
     */
@@ -79,6 +82,8 @@ upload_blorp_shader(struct blorp_context *blorp,
 
    *kernel_out = bin->kernel.offset;
    *(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data;
+
+   return true;
 }
 
 void
@@ -106,6 +111,9 @@ anv_device_init_blorp(struct anv_device *device)
    case 9:
       device->blorp.exec = gen9_blorp_exec;
       break;
+   case 10:
+      device->blorp.exec = gen10_blorp_exec;
+      break;
    default:
       unreachable("Unknown hardware generation");
    }
@@ -128,6 +136,7 @@ get_blorp_surf_for_anv_buffer(struct anv_device *device,
 {
    const struct isl_format_layout *fmtl =
       isl_format_get_layout(format);
+   bool ok UNUSED;
 
    /* ASTC is the only format which doesn't support linear layouts.
     * Create an equivalently sized surface with ISL to get around this.
@@ -150,20 +159,20 @@ get_blorp_surf_for_anv_buffer(struct anv_device *device,
       },
    };
 
-   isl_surf_init(&device->isl_dev, isl_surf,
-                 .dim = ISL_SURF_DIM_2D,
-                 .format = format,
-                 .width = width,
-                 .height = height,
-                 .depth = 1,
-                 .levels = 1,
-                 .array_len = 1,
-                 .samples = 1,
-                 .min_pitch = row_pitch,
-                 .usage = ISL_SURF_USAGE_TEXTURE_BIT |
-                          ISL_SURF_USAGE_RENDER_TARGET_BIT,
-                 .tiling_flags = ISL_TILING_LINEAR_BIT);
-   assert(isl_surf->row_pitch == row_pitch);
+   ok = isl_surf_init(&device->isl_dev, isl_surf,
+                      .dim = ISL_SURF_DIM_2D,
+                      .format = format,
+                      .width = width,
+                      .height = height,
+                      .depth = 1,
+                      .levels = 1,
+                      .array_len = 1,
+                      .samples = 1,
+                      .row_pitch = row_pitch,
+                      .usage = ISL_SURF_USAGE_TEXTURE_BIT |
+                               ISL_SURF_USAGE_RENDER_TARGET_BIT,
+                      .tiling_flags = ISL_TILING_LINEAR_BIT);
+   assert(ok);
 }
 
 static void
@@ -172,7 +181,8 @@ get_blorp_surf_for_anv_image(const struct anv_image *image,
                              enum isl_aux_usage aux_usage,
                              struct blorp_surf *blorp_surf)
 {
-   if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT)
+   if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT ||
+       aux_usage == ISL_AUX_USAGE_HIZ)
       aux_usage = ISL_AUX_USAGE_NONE;
 
    const struct anv_surface *surface =
@@ -226,7 +236,8 @@ void anv_CmdCopyImage(
          layer_count = pRegions[r].extent.depth;
       } else {
          dst_base_layer = pRegions[r].dstSubresource.baseArrayLayer;
-         layer_count = pRegions[r].dstSubresource.layerCount;
+         layer_count =
+            anv_get_layerCount(dst_image, &pRegions[r].dstSubresource);
       }
 
       unsigned src_base_layer;
@@ -234,7 +245,8 @@ void anv_CmdCopyImage(
          src_base_layer = pRegions[r].srcOffset.z;
       } else {
          src_base_layer = pRegions[r].srcSubresource.baseArrayLayer;
-         assert(pRegions[r].srcSubresource.layerCount == layer_count);
+         assert(layer_count ==
+                anv_get_layerCount(src_image, &pRegions[r].srcSubresource));
      }
 
       assert(pRegions[r].srcSubresource.aspectMask ==
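The layerCount fixes above (and the further ones below) all funnel through
anv_get_layerCount().  Its definition lives in anv_private.h, outside this
diff; a minimal sketch of the assumed shape is given here.  Since the call
sites pass both VkImageSubresourceRange and VkImageSubresourceLayers
pointers, the sketch is written as a macro, and the array_size field name is
likewise an assumption about struct anv_image.

   /* Assumed shape of the helper used throughout this patch (the real
    * definition is in anv_private.h, not part of this diff): resolve
    * VK_REMAINING_ARRAY_LAYERS against the image's actual array length.
    */
   #define anv_get_layerCount(_image, _range) \
      ((_range)->layerCount == VK_REMAINING_ARRAY_LAYERS ? \
       (_image)->array_size - (_range)->baseArrayLayer : \
       (_range)->layerCount)

Routing every layerCount read through one helper keeps the
VK_REMAINING_ARRAY_LAYERS special case in a single place instead of
re-deriving it at each call site.
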
@@ -306,7 +318,8 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer, anv_sanitize_image_extent(anv_image->type, pRegions[r].imageExtent); if (anv_image->type != VK_IMAGE_TYPE_3D) { image.offset.z = pRegions[r].imageSubresource.baseArrayLayer; - extent.depth = pRegions[r].imageSubresource.layerCount; + extent.depth = + anv_get_layerCount(anv_image, &pRegions[r].imageSubresource); } const enum isl_format buffer_format = @@ -460,7 +473,7 @@ void anv_CmdBlitImage( dst_end = pRegions[r].dstOffsets[1].z; } else { dst_start = dst_res->baseArrayLayer; - dst_end = dst_start + dst_res->layerCount; + dst_end = dst_start + anv_get_layerCount(dst_image, dst_res); } unsigned src_start, src_end; @@ -470,7 +483,7 @@ void anv_CmdBlitImage( src_end = pRegions[r].srcOffsets[1].z; } else { src_start = src_res->baseArrayLayer; - src_end = src_start + src_res->layerCount; + src_end = src_start + anv_get_layerCount(src_image, src_res); } bool flip_z = flip_coords(&src_start, &src_end, &dst_start, &dst_end); @@ -502,7 +515,8 @@ void anv_CmdBlitImage( blorp_blit(&batch, &src, src_res->mipLevel, src_z, src_format.isl_format, src_format.swizzle, &dst, dst_res->mipLevel, dst_z, - dst_format.isl_format, dst_format.swizzle, + dst_format.isl_format, + anv_swizzle_for_render(dst_format.swizzle), src_x0, src_y0, src_x1, src_y1, dst_x0, dst_y0, dst_x1, dst_y1, gl_filter, flip_x, flip_y); @@ -675,10 +689,15 @@ void anv_CmdUpdateBuffer( * little data at the top to build its linked list. */ const uint32_t max_update_size = - cmd_buffer->device->dynamic_state_block_pool.block_size - 64; + cmd_buffer->device->dynamic_state_pool.block_size - 64; assert(max_update_size < MAX_SURFACE_DIM * 4); + /* We're about to read data that was written from the CPU. Flush the + * texture cache so we don't get anything stale. + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + while (dataSize) { const uint32_t copy_size = MIN2(dataSize, max_update_size); @@ -687,12 +706,14 @@ void anv_CmdUpdateBuffer( memcpy(tmp_data.map, pData, copy_size); + anv_state_flush(cmd_buffer->device, tmp_data); + int bs = 16; bs = gcd_pow2_u64(bs, dstOffset); bs = gcd_pow2_u64(bs, copy_size); do_buffer_copy(&batch, - &cmd_buffer->device->dynamic_state_block_pool.bo, + &cmd_buffer->device->dynamic_state_pool.block_pool.bo, tmp_data.offset, dst_buffer->bo, dst_buffer->offset + dstOffset, copy_size / bs, 1, bs); @@ -720,11 +741,17 @@ void anv_CmdFillBuffer( struct blorp_batch batch; blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0); - if (fillSize == VK_WHOLE_SIZE) { - fillSize = dst_buffer->size - dstOffset; - /* Make sure fillSize is a multiple of 4 */ - fillSize &= ~3ull; - } + fillSize = anv_buffer_get_range(dst_buffer, dstOffset, fillSize); + + /* From the Vulkan spec: + * + * "size is the number of bytes to fill, and must be either a multiple + * of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of + * the buffer. If VK_WHOLE_SIZE is used and the remaining size of the + * buffer is not a multiple of 4, then the nearest smaller multiple is + * used." + */ + fillSize &= ~3ull; /* First, we compute the biggest format that can be used with the * given offsets and size. 
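The anv_CmdUpdateBuffer hunk above picks the copy block size with
gcd_pow2_u64(), which is defined earlier in anv_blorp.c and not visible in
this diff.  The update is issued as a blorp "pixel" copy, so the bytes per
pixel must be a power of two that divides both the destination offset and the
copy size.  A sketch of the helper, assuming mesa's MIN2() macro:

   /* Largest power of two dividing both a and b, i.e. the lower of their
    * lowest set bits (sketch; the real helper sits earlier in this file).
    */
   static uint64_t
   gcd_pow2_u64(uint64_t a, uint64_t b)
   {
      assert(a > 0 || b > 0);

      unsigned a_log2 = __builtin_ffsll(a) - 1;  /* index of lowest set bit */
      unsigned b_log2 = __builtin_ffsll(b) - 1;

      /* If a or b is 0, ffsll() returns 0 and the unsigned subtraction
       * wraps to UINT_MAX, so the other operand wins the MIN2().
       */
      return 1ULL << MIN2(a_log2, b_log2);
   }

For example, dstOffset = 36 and copy_size = 24 give
bs = gcd_pow2_u64(gcd_pow2_u64(16, 36), 24) = 4, so the update is emitted as
a copy of 4-byte texels.
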
@@ -817,7 +844,7 @@ void anv_CmdClearColorImage( VK_IMAGE_ASPECT_COLOR_BIT, image->tiling); unsigned base_layer = pRanges[r].baseArrayLayer; - unsigned layer_count = pRanges[r].layerCount; + unsigned layer_count = anv_get_layerCount(image, &pRanges[r]); for (unsigned i = 0; i < anv_get_levelCount(image, &pRanges[r]); i++) { const unsigned level = pRanges[r].baseMipLevel + i; @@ -857,7 +884,7 @@ void anv_CmdClearDepthStencilImage( struct blorp_surf depth, stencil; if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_DEPTH_BIT, - image->aux_usage, &depth); + ISL_AUX_USAGE_NONE, &depth); } else { memset(&depth, 0, sizeof(depth)); } @@ -877,7 +904,7 @@ void anv_CmdClearDepthStencilImage( bool clear_stencil = pRanges[r].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT; unsigned base_layer = pRanges[r].baseArrayLayer; - unsigned layer_count = pRanges[r].layerCount; + unsigned layer_count = anv_get_layerCount(image, &pRanges[r]); for (unsigned i = 0; i < anv_get_levelCount(image, &pRanges[r]); i++) { const unsigned level = pRanges[r].baseMipLevel + i; @@ -899,44 +926,52 @@ void anv_CmdClearDepthStencilImage( blorp_batch_finish(&batch); } -struct anv_state +VkResult anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer, uint32_t num_entries, - uint32_t *state_offset) + uint32_t *state_offset, + struct anv_state *bt_state) { - struct anv_state bt_state = - anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries, - state_offset); - if (bt_state.map == NULL) { + *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries, + state_offset); + if (bt_state->map == NULL) { /* We ran out of space. Grab a new binding table block. */ VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); - assert(result == VK_SUCCESS); + if (result != VK_SUCCESS) + return result; /* Re-emit state base addresses so we get the new surface state base * address before we start emitting binding tables etc. 
*/ anv_cmd_buffer_emit_state_base_address(cmd_buffer); - bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries, - state_offset); - assert(bt_state.map != NULL); + *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries, + state_offset); + assert(bt_state->map != NULL); } - return bt_state; + return VK_SUCCESS; } -static uint32_t +static VkResult binding_table_for_surface_state(struct anv_cmd_buffer *cmd_buffer, - struct anv_state surface_state) + struct anv_state surface_state, + uint32_t *bt_offset) { uint32_t state_offset; - struct anv_state bt_state = - anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, 1, &state_offset); + struct anv_state bt_state; + + VkResult result = + anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, 1, &state_offset, + &bt_state); + if (result != VK_SUCCESS) + return result; uint32_t *bt_map = bt_state.map; bt_map[0] = surface_state.offset + state_offset; - return bt_state.offset; + *bt_offset = bt_state.offset; + return VK_SUCCESS; } static void @@ -947,7 +982,7 @@ clear_color_attachment(struct anv_cmd_buffer *cmd_buffer, { const struct anv_subpass *subpass = cmd_buffer->state.subpass; const uint32_t color_att = attachment->colorAttachment; - const uint32_t att_idx = subpass->color_attachments[color_att]; + const uint32_t att_idx = subpass->color_attachments[color_att].attachment; if (att_idx == VK_ATTACHMENT_UNUSED) return; @@ -957,12 +992,35 @@ clear_color_attachment(struct anv_cmd_buffer *cmd_buffer, struct anv_attachment_state *att_state = &cmd_buffer->state.attachments[att_idx]; - uint32_t binding_table = - binding_table_for_surface_state(cmd_buffer, att_state->color_rt_state); + uint32_t binding_table; + VkResult result = + binding_table_for_surface_state(cmd_buffer, att_state->color_rt_state, + &binding_table); + if (result != VK_SUCCESS) + return; union isl_color_value clear_color = vk_to_isl_color(attachment->clearValue.color); + /* If multiview is enabled we ignore baseArrayLayer and layerCount */ + if (subpass->view_mask) { + uint32_t view_idx; + for_each_bit(view_idx, subpass->view_mask) { + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + blorp_clear_attachments(batch, binding_table, + ISL_FORMAT_UNSUPPORTED, pass_att->samples, + view_idx, 1, + offset.x, offset.y, + offset.x + extent.width, + offset.y + extent.height, + true, clear_color, false, 0.0f, 0, 0); + } + } + return; + } + for (uint32_t r = 0; r < rectCount; ++r) { const VkOffset2D offset = pRects[r].rect.offset; const VkExtent2D extent = pRects[r].rect.extent; @@ -984,7 +1042,7 @@ clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer, { static const union isl_color_value color_value = { .u32 = { 0, } }; const struct anv_subpass *subpass = cmd_buffer->state.subpass; - const uint32_t att_idx = subpass->depth_stencil_attachment; + const uint32_t att_idx = subpass->depth_stencil_attachment.attachment; if (att_idx == VK_ATTACHMENT_UNUSED) return; @@ -1003,9 +1061,35 @@ clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer, VK_IMAGE_TILING_OPTIMAL); } - uint32_t binding_table = + uint32_t binding_table; + VkResult result = binding_table_for_surface_state(cmd_buffer, - cmd_buffer->state.null_surface_state); + cmd_buffer->state.null_surface_state, + &binding_table); + if (result != VK_SUCCESS) + return; + + /* If multiview is enabled we ignore baseArrayLayer and layerCount */ + if (subpass->view_mask) { + uint32_t view_idx; + 
for_each_bit(view_idx, subpass->view_mask) { + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + VkClearDepthStencilValue value = attachment->clearValue.depthStencil; + blorp_clear_attachments(batch, binding_table, + depth_format, pass_att->samples, + view_idx, 1, + offset.x, offset.y, + offset.x + extent.width, + offset.y + extent.height, + false, color_value, + clear_depth, value.depth, + clear_stencil ? 0xff : 0, value.stencil); + } + } + return; + } for (uint32_t r = 0; r < rectCount; ++r) { const VkOffset2D offset = pRects[r].rect.offset; @@ -1060,98 +1144,27 @@ enum subpass_stage { SUBPASS_STAGE_RESOLVE, }; -static bool -attachment_needs_flush(struct anv_cmd_buffer *cmd_buffer, - struct anv_render_pass_attachment *att, - enum subpass_stage stage) -{ - struct anv_render_pass *pass = cmd_buffer->state.pass; - struct anv_subpass *subpass = cmd_buffer->state.subpass; - unsigned subpass_idx = subpass - pass->subpasses; - assert(subpass_idx < pass->subpass_count); - - /* We handle this subpass specially based on the current stage */ - enum anv_subpass_usage usage = att->subpass_usage[subpass_idx]; - switch (stage) { - case SUBPASS_STAGE_LOAD: - if (usage & (ANV_SUBPASS_USAGE_INPUT | ANV_SUBPASS_USAGE_RESOLVE_SRC)) - return true; - break; - - case SUBPASS_STAGE_DRAW: - if (usage & ANV_SUBPASS_USAGE_RESOLVE_SRC) - return true; - break; - - default: - break; - } - - for (uint32_t s = subpass_idx + 1; s < pass->subpass_count; s++) { - usage = att->subpass_usage[s]; - - /* If this attachment is going to be used as an input in this or any - * future subpass, then we need to flush its cache and invalidate the - * texture cache. - */ - if (att->subpass_usage[s] & ANV_SUBPASS_USAGE_INPUT) - return true; - - if (usage & (ANV_SUBPASS_USAGE_DRAW | ANV_SUBPASS_USAGE_RESOLVE_DST)) { - /* We found another subpass that draws to this attachment. We'll - * wait to resolve until then. 
-          */
-         return false;
-      }
-   }
-
-   return false;
-}
-
-static void
-anv_cmd_buffer_flush_attachments(struct anv_cmd_buffer *cmd_buffer,
-                                 enum subpass_stage stage)
-{
-   struct anv_subpass *subpass = cmd_buffer->state.subpass;
-   struct anv_render_pass *pass = cmd_buffer->state.pass;
-
-   for (uint32_t i = 0; i < subpass->color_count; ++i) {
-      uint32_t att = subpass->color_attachments[i];
-      assert(att < pass->attachment_count);
-      if (attachment_needs_flush(cmd_buffer, &pass->attachments[att], stage)) {
-         cmd_buffer->state.pending_pipe_bits |=
-            ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
-            ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
-      }
-   }
-
-   if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
-      uint32_t att = subpass->depth_stencil_attachment;
-      assert(att < pass->attachment_count);
-      if (attachment_needs_flush(cmd_buffer, &pass->attachments[att], stage)) {
-         cmd_buffer->state.pending_pipe_bits |=
-            ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
-            ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
-      }
-   }
-}
-
 static bool
 subpass_needs_clear(const struct anv_cmd_buffer *cmd_buffer)
 {
    const struct anv_cmd_state *cmd_state = &cmd_buffer->state;
 
-   uint32_t ds = cmd_state->subpass->depth_stencil_attachment;
+   uint32_t ds = cmd_state->subpass->depth_stencil_attachment.attachment;
    for (uint32_t i = 0; i < cmd_state->subpass->color_count; ++i) {
-      uint32_t a = cmd_state->subpass->color_attachments[i];
+      uint32_t a = cmd_state->subpass->color_attachments[i].attachment;
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      assert(a < cmd_state->pass->attachment_count);
       if (cmd_state->attachments[a].pending_clear_aspects) {
          return true;
       }
    }
 
-   if (ds != VK_ATTACHMENT_UNUSED &&
-       cmd_state->attachments[ds].pending_clear_aspects) {
-      return true;
+   if (ds != VK_ATTACHMENT_UNUSED) {
+      assert(ds < cmd_state->pass->attachment_count);
+      if (cmd_state->attachments[ds].pending_clear_aspects)
+         return true;
    }
 
    return false;
@@ -1161,6 +1174,8 @@ void
 anv_cmd_buffer_clear_subpass(struct anv_cmd_buffer *cmd_buffer)
 {
    const struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+   const VkRect2D render_area = cmd_buffer->state.render_area;
+
    if (!subpass_needs_clear(cmd_buffer))
       return;
 
@@ -1180,7 +1195,11 @@ anv_cmd_buffer_clear_subpass(struct anv_cmd_buffer *cmd_buffer)
    struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
 
    for (uint32_t i = 0; i < cmd_state->subpass->color_count; ++i) {
-      const uint32_t a = cmd_state->subpass->color_attachments[i];
+      const uint32_t a = cmd_state->subpass->color_attachments[i].attachment;
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      assert(a < cmd_state->pass->attachment_count);
       struct anv_attachment_state *att_state = &cmd_state->attachments[a];
 
       if (!att_state->pending_clear_aspects)
@@ -1194,20 +1213,53 @@ anv_cmd_buffer_clear_subpass(struct anv_cmd_buffer *cmd_buffer)
       get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT,
                                    att_state->aux_usage, &surf);
 
-      const VkRect2D render_area = cmd_buffer->state.render_area;
+      if (att_state->fast_clear) {
+         surf.clear_color = vk_to_isl_color(att_state->clear_value.color);
+
+         /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
+          *
+          *    "After Render target fast clear, pipe-control with color cache
+          *    write-flush must be issued before sending any DRAW commands on
+          *    that render target."
+          *
+          * This comment is a bit cryptic and doesn't really tell you what's
+          * going on or what's really needed.  It appears that fast clear ops are
+          * not properly synchronized with other drawing. 
This means that we + * cannot have a fast clear operation in the pipe at the same time as + * other regular drawing operations. We need to use a PIPE_CONTROL + * to ensure that the contents of the previous draw hit the render + * target before we resolve and then use a second PIPE_CONTROL after + * the resolve to ensure that it is completed before any additional + * drawing occurs. + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + + blorp_fast_clear(&batch, &surf, iview->isl.format, + iview->isl.base_level, + iview->isl.base_array_layer, fb->layers, + render_area.offset.x, render_area.offset.y, + render_area.offset.x + render_area.extent.width, + render_area.offset.y + render_area.extent.height); - blorp_clear(&batch, &surf, iview->isl.format, iview->isl.swizzle, - iview->isl.base_level, - iview->isl.base_array_layer, fb->layers, - render_area.offset.x, render_area.offset.y, - render_area.offset.x + render_area.extent.width, - render_area.offset.y + render_area.extent.height, - vk_to_isl_color(att_state->clear_value.color), NULL); + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + } else { + blorp_clear(&batch, &surf, iview->isl.format, + anv_swizzle_for_render(iview->isl.swizzle), + iview->isl.base_level, + iview->isl.base_array_layer, fb->layers, + render_area.offset.x, render_area.offset.y, + render_area.offset.x + render_area.extent.width, + render_area.offset.y + render_area.extent.height, + vk_to_isl_color(att_state->clear_value.color), NULL); + } att_state->pending_clear_aspects = 0; } - const uint32_t ds = cmd_state->subpass->depth_stencil_attachment; + const uint32_t ds = cmd_state->subpass->depth_stencil_attachment.attachment; + assert(ds == VK_ATTACHMENT_UNUSED || ds < cmd_state->pass->attachment_count); if (ds != VK_ATTACHMENT_UNUSED && cmd_state->attachments[ds].pending_clear_aspects) { @@ -1217,15 +1269,91 @@ anv_cmd_buffer_clear_subpass(struct anv_cmd_buffer *cmd_buffer) .clearValue = cmd_state->attachments[ds].clear_value, }; - clear_depth_stencil_attachment(cmd_buffer, &batch, - &clear_att, 1, &clear_rect); + + const uint8_t gen = cmd_buffer->device->info.gen; + bool clear_with_hiz = gen >= 8 && cmd_state->attachments[ds].aux_usage == + ISL_AUX_USAGE_HIZ; + const struct anv_image_view *iview = fb->attachments[ds]; + + if (clear_with_hiz) { + const bool clear_depth = clear_att.aspectMask & + VK_IMAGE_ASPECT_DEPTH_BIT; + const bool clear_stencil = clear_att.aspectMask & + VK_IMAGE_ASPECT_STENCIL_BIT; + + /* Check against restrictions for depth buffer clearing. A great GPU + * performance benefit isn't expected when using the HZ sequence for + * stencil-only clears. Therefore, we don't emit a HZ op sequence for + * a stencil clear in addition to using the BLORP-fallback for depth. + */ + if (clear_depth) { + if (!blorp_can_hiz_clear_depth(gen, iview->isl.format, + iview->image->samples, + render_area.offset.x, + render_area.offset.y, + render_area.offset.x + + render_area.extent.width, + render_area.offset.y + + render_area.extent.height)) { + clear_with_hiz = false; + } else if (clear_att.clearValue.depthStencil.depth != + ANV_HZ_FC_VAL) { + /* Don't enable fast depth clears for any color not equal to + * ANV_HZ_FC_VAL. 
+             */
+            clear_with_hiz = false;
+         } else if (gen == 8 &&
+                    anv_can_sample_with_hiz(&cmd_buffer->device->info,
+                                            iview->aspect_mask,
+                                            iview->image->samples)) {
+            /* Only gen9+ supports returning ANV_HZ_FC_VAL when sampling a
+             * fast-cleared portion of a HiZ buffer. Testing has revealed
+             * that Gen8 only supports returning 0.0f. Gens prior to gen8 do
+             * not support this feature at all.
+             */
+            clear_with_hiz = false;
+         }
+      }
+
+      if (clear_with_hiz) {
+         blorp_gen8_hiz_clear_attachments(&batch, iview->image->samples,
+                                          render_area.offset.x,
+                                          render_area.offset.y,
+                                          render_area.offset.x +
+                                          render_area.extent.width,
+                                          render_area.offset.y +
+                                          render_area.extent.height,
+                                          clear_depth, clear_stencil,
+                                          clear_att.clearValue.
+                                             depthStencil.stencil);
+
+         /* From the SKL PRM, Depth Buffer Clear:
+          *
+          *    Depth Buffer Clear Workaround
+          *    Depth buffer clear pass using any of the methods (WM_STATE,
+          *    3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a
+          *    PIPE_CONTROL command with DEPTH_STALL bit and Depth FLUSH bits
+          *    “set” before starting to render. DepthStall and DepthFlush are
+          *    not needed between consecutive depth clear passes nor is it
+          *    required if the depth-clear pass was done with “full_surf_clear”
+          *    bit set in the 3DSTATE_WM_HZ_OP.
+          */
+         if (clear_depth) {
+            cmd_buffer->state.pending_pipe_bits |=
+               ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_DEPTH_STALL_BIT;
+         }
+      }
+   }
+
+   if (!clear_with_hiz) {
+      clear_depth_stencil_attachment(cmd_buffer, &batch,
+                                     &clear_att, 1, &clear_rect);
+   }
 
       cmd_state->attachments[ds].pending_clear_aspects = 0;
    }
 
    blorp_batch_finish(&batch);
-
-   anv_cmd_buffer_flush_attachments(cmd_buffer, SUBPASS_STAGE_LOAD);
 }
 
 static void
@@ -1283,10 +1411,11 @@ void anv_CmdResolveImage(
    for (uint32_t r = 0; r < regionCount; r++) {
       assert(pRegions[r].srcSubresource.aspectMask ==
              pRegions[r].dstSubresource.aspectMask);
-      assert(pRegions[r].srcSubresource.layerCount ==
-             pRegions[r].dstSubresource.layerCount);
+      assert(anv_get_layerCount(src_image, &pRegions[r].srcSubresource) ==
+             anv_get_layerCount(dst_image, &pRegions[r].dstSubresource));
 
-      const uint32_t layer_count = pRegions[r].dstSubresource.layerCount;
+      const uint32_t layer_count =
+         anv_get_layerCount(dst_image, &pRegions[r].dstSubresource);
 
       for (uint32_t layer = 0; layer < layer_count; layer++) {
          resolve_image(&batch,
@@ -1304,37 +1433,101 @@ void anv_CmdResolveImage(
 
    blorp_batch_finish(&batch);
 }
 
+void
+anv_image_ccs_clear(struct anv_cmd_buffer *cmd_buffer,
+                    const struct anv_image *image,
+                    const uint32_t base_level, const uint32_t level_count,
+                    const uint32_t base_layer, uint32_t layer_count)
+{
+   assert(image->type == VK_IMAGE_TYPE_3D || image->extent.depth == 1);
+
+   if (image->type == VK_IMAGE_TYPE_3D) {
+      assert(base_layer == 0);
+      assert(layer_count == anv_minify(image->extent.depth, base_level));
+   }
+
+   struct blorp_batch batch;
+   blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+
+   struct blorp_surf surf;
+   get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT,
+                                image->aux_usage, &surf);
+
+   /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
+    *
+    *    "After Render target fast clear, pipe-control with color cache
+    *    write-flush must be issued before sending any DRAW commands on
+    *    that render target."
+    *
+    * This comment is a bit cryptic and doesn't really tell you what's going
+    * on or what's really needed.  It appears that fast clear ops are not
+    * properly synchronized with other drawing. 
This means that we cannot + * have a fast clear operation in the pipe at the same time as other + * regular drawing operations. We need to use a PIPE_CONTROL to ensure + * that the contents of the previous draw hit the render target before we + * resolve and then use a second PIPE_CONTROL after the resolve to ensure + * that it is completed before any additional drawing occurs. + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + const VkExtent3D extent = { + .width = anv_minify(image->extent.width, level), + .height = anv_minify(image->extent.height, level), + .depth = anv_minify(image->extent.depth, level), + }; + + if (image->type == VK_IMAGE_TYPE_3D) + layer_count = extent.depth; + + assert(level < anv_image_aux_levels(image)); + assert(base_layer + layer_count <= anv_image_aux_layers(image, level)); + blorp_fast_clear(&batch, &surf, surf.surf->format, + level, base_layer, layer_count, + 0, 0, extent.width, extent.height); + } + + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; +} + static void ccs_resolve_attachment(struct anv_cmd_buffer *cmd_buffer, - struct blorp_batch *batch, uint32_t att) { struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; struct anv_attachment_state *att_state = &cmd_buffer->state.attachments[att]; - assert(att_state->aux_usage != ISL_AUX_USAGE_CCS_D); - if (att_state->aux_usage != ISL_AUX_USAGE_CCS_E) + if (att_state->aux_usage == ISL_AUX_USAGE_NONE || + att_state->aux_usage == ISL_AUX_USAGE_MCS) return; /* Nothing to resolve */ + assert(att_state->aux_usage == ISL_AUX_USAGE_CCS_E || + att_state->aux_usage == ISL_AUX_USAGE_CCS_D); + struct anv_render_pass *pass = cmd_buffer->state.pass; - struct anv_subpass *subpass = cmd_buffer->state.subpass; - unsigned subpass_idx = subpass - pass->subpasses; - assert(subpass_idx < pass->subpass_count); + const uint32_t subpass_idx = anv_get_subpass_id(&cmd_buffer->state); /* Scan forward to see what all ways this attachment will be used. * Ideally, we would like to resolve in the same subpass as the last write * of a particular attachment. That way we only resolve once but it's * still hot in the cache. */ + bool found_draw = false; + enum anv_subpass_usage usage = 0; for (uint32_t s = subpass_idx + 1; s < pass->subpass_count; s++) { - enum anv_subpass_usage usage = pass->attachments[att].subpass_usage[s]; + usage |= pass->attachments[att].subpass_usage[s]; if (usage & (ANV_SUBPASS_USAGE_DRAW | ANV_SUBPASS_USAGE_RESOLVE_DST)) { /* We found another subpass that draws to this attachment. We'll * wait to resolve until then. */ - return; + found_draw = true; + break; } } @@ -1342,12 +1535,61 @@ ccs_resolve_attachment(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image = iview->image; assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); - if (image->aux_usage == ISL_AUX_USAGE_CCS_E) - return; + enum blorp_fast_clear_op resolve_op = BLORP_FAST_CLEAR_OP_NONE; + if (!found_draw) { + /* This is the last subpass that writes to this attachment so we need to + * resolve here. Ideally, we would like to only resolve if the storeOp + * is set to VK_ATTACHMENT_STORE_OP_STORE. 
However, we need to ensure + * that the CCS bits are set to "resolved" because there may be copy or + * blit operations (which may ignore CCS) between now and the next time + * we render and we need to ensure that anything they write will be + * respected in the next render. Unfortunately, the hardware does not + * provide us with any sort of "invalidate" pass that sets the CCS to + * "resolved" without writing to the render target. + */ + if (iview->image->aux_usage != ISL_AUX_USAGE_CCS_E) { + /* The image destination surface doesn't support compression outside + * the render pass. We need a full resolve. + */ + resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL; + } else if (att_state->fast_clear) { + /* We don't know what to do with clear colors outside the render + * pass. We need a partial resolve. Only transparent black is + * built into the surface state object and thus no resolve is + * required for this case. + */ + if (att_state->clear_value.color.uint32[0] || + att_state->clear_value.color.uint32[1] || + att_state->clear_value.color.uint32[2] || + att_state->clear_value.color.uint32[3]) + resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL; + } else { + /* The image "natively" supports all the compression we care about + * and we don't need to resolve at all. If this is the case, we also + * don't need to resolve for any of the input attachment cases below. + */ + } + } else if (usage & ANV_SUBPASS_USAGE_INPUT) { + /* Input attachments are clear-color aware so, at least on Sky Lake, we + * can frequently sample from them with no resolves at all. + */ + if (att_state->aux_usage != att_state->input_aux_usage) { + assert(att_state->input_aux_usage == ISL_AUX_USAGE_NONE); + resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL; + } else if (!att_state->clear_color_is_zero_one) { + /* Sky Lake PRM, Vol. 2d, RENDER_SURFACE_STATE::Red Clear Color: + * + * "If Number of Multisamples is MULTISAMPLECOUNT_1 AND if this RT + * is fast cleared with non-0/1 clear value, this RT must be + * partially resolved (refer to Partial Resolve operation) before + * binding this surface to Sampler." + */ + resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL; + } + } - struct blorp_surf surf; - get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT, - att_state->aux_usage, &surf); + if (resolve_op == BLORP_FAST_CLEAR_OP_NONE) + return; /* From the Sky Lake PRM Vol. 
7, "Render Target Resolve": * @@ -1364,16 +1606,16 @@ ccs_resolve_attachment(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; - for (uint32_t layer = 0; layer < fb->layers; layer++) { - blorp_ccs_resolve(batch, &surf, - iview->isl.base_level, - iview->isl.base_array_layer + layer, - iview->isl.format, - BLORP_FAST_CLEAR_OP_RESOLVE_FULL); - } + anv_ccs_resolve(cmd_buffer, att_state->color_rt_state, image, + iview->isl.base_level, fb->layers, resolve_op); cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + + /* Once we've done any sort of resolve, we're no longer fast-cleared */ + att_state->fast_clear = false; + if (att_state->aux_usage == ISL_AUX_USAGE_CCS_D) + att_state->aux_usage = ISL_AUX_USAGE_NONE; } void @@ -1383,24 +1625,34 @@ anv_cmd_buffer_resolve_subpass(struct anv_cmd_buffer *cmd_buffer) struct anv_subpass *subpass = cmd_buffer->state.subpass; - struct blorp_batch batch; - blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0); - for (uint32_t i = 0; i < subpass->color_count; ++i) { - ccs_resolve_attachment(cmd_buffer, &batch, - subpass->color_attachments[i]); - } + const uint32_t att = subpass->color_attachments[i].attachment; + if (att == VK_ATTACHMENT_UNUSED) + continue; - anv_cmd_buffer_flush_attachments(cmd_buffer, SUBPASS_STAGE_DRAW); + assert(att < cmd_buffer->state.pass->attachment_count); + ccs_resolve_attachment(cmd_buffer, att); + } if (subpass->has_resolve) { + /* We are about to do some MSAA resolves. We need to flush so that the + * result of writes to the MSAA color attachments show up in the sampler + * when we blit to the single-sampled resolve target. + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + for (uint32_t i = 0; i < subpass->color_count; ++i) { - uint32_t src_att = subpass->color_attachments[i]; - uint32_t dst_att = subpass->resolve_attachments[i]; + uint32_t src_att = subpass->color_attachments[i].attachment; + uint32_t dst_att = subpass->resolve_attachments[i].attachment; if (dst_att == VK_ATTACHMENT_UNUSED) continue; + assert(src_att < cmd_buffer->state.pass->attachment_count); + assert(dst_att < cmd_buffer->state.pass->attachment_count); + if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) { /* From the Vulkan 1.0 spec: * @@ -1418,6 +1670,10 @@ anv_cmd_buffer_resolve_subpass(struct anv_cmd_buffer *cmd_buffer) const VkRect2D render_area = cmd_buffer->state.render_area; assert(src_iview->aspect_mask == dst_iview->aspect_mask); + + struct blorp_batch batch; + blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0); + resolve_image(&batch, src_iview->image, src_iview->isl.base_level, src_iview->isl.base_array_layer, @@ -1429,11 +1685,86 @@ anv_cmd_buffer_resolve_subpass(struct anv_cmd_buffer *cmd_buffer) render_area.offset.x, render_area.offset.y, render_area.extent.width, render_area.extent.height); - ccs_resolve_attachment(cmd_buffer, &batch, dst_att); - } + blorp_batch_finish(&batch); - anv_cmd_buffer_flush_attachments(cmd_buffer, SUBPASS_STAGE_RESOLVE); + ccs_resolve_attachment(cmd_buffer, dst_att); + } } +} + +void +anv_gen8_hiz_op_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum blorp_hiz_op op) +{ + assert(image); + + /* Don't resolve depth buffers without an auxiliary HiZ buffer and + * don't perform such a resolve on gens that don't support it. 
+ */ + if (cmd_buffer->device->info.gen < 8 || + image->aux_usage != ISL_AUX_USAGE_HIZ) + return; + + assert(op == BLORP_HIZ_OP_HIZ_RESOLVE || + op == BLORP_HIZ_OP_DEPTH_RESOLVE); + + struct blorp_batch batch; + blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_DEPTH_BIT, + ISL_AUX_USAGE_NONE, &surf); + + /* Manually add the aux HiZ surf */ + surf.aux_surf = &image->aux_surface.isl, + surf.aux_addr = (struct blorp_address) { + .buffer = image->bo, + .offset = image->offset + image->aux_surface.offset, + }; + surf.aux_usage = ISL_AUX_USAGE_HIZ; + + surf.clear_color.f32[0] = ANV_HZ_FC_VAL; + + blorp_hiz_op(&batch, &surf, 0, 0, 1, op); + blorp_batch_finish(&batch); +} + +void +anv_ccs_resolve(struct anv_cmd_buffer * const cmd_buffer, + const struct anv_state surface_state, + const struct anv_image * const image, + const uint8_t level, const uint32_t layer_count, + const enum blorp_fast_clear_op op) +{ + assert(cmd_buffer && image); + + /* The resolved subresource range must have a CCS buffer. */ + assert(level < anv_image_aux_levels(image)); + assert(layer_count <= anv_image_aux_layers(image, level)); + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT && image->samples == 1); + + /* Create a binding table for this surface state. */ + uint32_t binding_table; + VkResult result = + binding_table_for_surface_state(cmd_buffer, surface_state, + &binding_table); + if (result != VK_SUCCESS) + return; + + struct blorp_batch batch; + blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT, + image->aux_usage == ISL_AUX_USAGE_CCS_E ? + ISL_AUX_USAGE_CCS_E : ISL_AUX_USAGE_CCS_D, + &surf); + + blorp_ccs_resolve_attachment(&batch, binding_table, &surf, level, + layer_count, image->color_surface.isl.format, + op); + blorp_batch_finish(&batch); }