From 67b1163f9fb56ca5e006f7b918bf1e97896b889c Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Wed, 17 Jun 2020 23:11:05 -0400 Subject: [PATCH] turnip: add support for D32_SFLOAT_S8_UINT Add support for D32_SFLOAT_S8_UINT, which requires special handling because it is actually two images. Signed-off-by: Jonathan Marek Part-of: --- src/freedreno/vulkan/tu_clear_blit.c | 293 +++++++++++++++++++-------- src/freedreno/vulkan/tu_cmd_buffer.c | 34 +++- src/freedreno/vulkan/tu_formats.c | 8 +- src/freedreno/vulkan/tu_image.c | 35 +++- src/freedreno/vulkan/tu_pass.c | 55 ++++- src/freedreno/vulkan/tu_private.h | 15 ++ src/freedreno/vulkan/tu_util.h | 1 + 7 files changed, 344 insertions(+), 97 deletions(-) diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index 6e1bd8193f4..fe045c581cd 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -228,6 +228,17 @@ r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) tu_cs_image_flag_ref(cs, iview, layer); } +static void +r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +{ + assert(iview->image->samples == 1); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); + tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS); + tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); + tu_cs_emit(cs, iview->stencil_PITCH); +} + static void r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) { @@ -681,6 +692,19 @@ r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled)); } +static void +r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +{ + tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */ + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); + tu_cs_emit(cs, 
tu_image_view_stencil(iview, RB_MRT_BUF_INFO)); + tu_cs_image_stencil_ref(cs, iview, layer); + tu_cs_emit(cs, 0); + + tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); +} + static void r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) { @@ -885,6 +909,11 @@ copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer) return format; case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: return VK_FORMAT_R32_UINT; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + return VK_FORMAT_S8_UINT; + assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT); + return VK_FORMAT_D32_SFLOAT; } } @@ -1640,14 +1669,15 @@ static void clear_image(struct tu_cmd_buffer *cmd, struct tu_image *image, const VkClearValue *clear_value, - const VkImageSubresourceRange *range) + const VkImageSubresourceRange *range, + VkImageAspectFlags aspect_mask) { uint32_t level_count = tu_get_levelCount(image, range); uint32_t layer_count = tu_get_layerCount(image, range); struct tu_cs *cs = &cmd->cs; VkFormat format = image->vk_format; - if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) - format = VK_FORMAT_R32_UINT; + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + format = copy_format(format, aspect_mask, false); if (image->type == VK_IMAGE_TYPE_3D) { assert(layer_count == 1); @@ -1656,8 +1686,11 @@ clear_image(struct tu_cmd_buffer *cmd, const struct blit_ops *ops = image->samples > 1 ? 
&r3d_ops : &r2d_ops; - ops->setup(cmd, cs, format, range->aspectMask, ROTATE_0, true, image->layout[0].ubwc); - ops->clear_value(cs, image->vk_format, clear_value); + ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc); + if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value); + else + ops->clear_value(cs, format, clear_value); for (unsigned j = 0; j < level_count; j++) { if (image->type == VK_IMAGE_TYPE_3D) @@ -1670,7 +1703,7 @@ clear_image(struct tu_cmd_buffer *cmd, struct tu_image_view dst; tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) { - .aspectMask = range->aspectMask, + .aspectMask = aspect_mask, .mipLevel = range->baseMipLevel + j, .baseArrayLayer = range->baseArrayLayer, .layerCount = 1, @@ -1697,7 +1730,7 @@ tu_CmdClearColorImage(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i); + clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT); } void @@ -1713,8 +1746,19 @@ tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); - for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i); + for (unsigned i = 0; i < rangeCount; i++) { + const VkImageSubresourceRange *range = &pRanges[i]; + + if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + /* can't clear both depth and stencil at once, split up the aspect mask */ + uint32_t b; + for_each_bit(b, range->aspectMask) + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b)); + continue; + } + + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask); + } } static void @@ -1906,29 +1950,26 @@ pack_gmem_clear_value(const 
VkClearValue *val, VkFormat format, uint32_t clear_v } static void -tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t attachment, - VkImageAspectFlags mask, - const VkClearValue *value) +clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat format, + uint8_t clear_mask, + uint32_t gmem_offset, + const VkClearValue *value) { - VkFormat vk_format = cmd->state.pass->attachments[attachment].format; - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format))); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format))); - tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, - .clear_mask = aspect_write_mask(vk_format, mask))); + tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask)); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset); + tu_cs_emit(cs, gmem_offset); tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); tu_cs_emit(cs, 0); uint32_t clear_vals[4] = {}; - pack_gmem_clear_value(value, vk_format, clear_vals); + pack_gmem_clear_value(value, format, clear_vals); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); tu_cs_emit_array(cs, clear_vals, 4); @@ -1936,6 +1977,27 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, tu6_emit_event_write(cmd, cs, BLIT); } +static void +tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t attachment, + VkImageAspectFlags mask, + const VkClearValue *value) +{ + const struct tu_render_pass_attachment *att = + &cmd->state.pass->attachments[attachment]; + + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) + clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value); + if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) + clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, 
att->gmem_offset_stencil, value); + return; + } + + clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value); +} + static void tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, uint32_t attachment_count, @@ -1997,35 +2059,65 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, tu_cond_exec_end(cs); } +static void +clear_sysmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat format, + VkImageAspectFlags clear_mask, + const VkRenderPassBeginInfo *info, + uint32_t a, + bool separate_stencil) +{ + const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct blit_ops *ops = &r2d_ops; + if (cmd->state.pass->attachments[a].samples > 1) + ops = &r3d_ops; + + ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled); + ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent); + ops->clear_value(cs, format, &info->pClearValues[a]); + + for (uint32_t i = 0; i < fb->layers; i++) { + if (separate_stencil) { + if (ops == &r3d_ops) + r3d_dst_stencil(cs, iview, i); + else + r2d_dst_stencil(cs, iview, i); + } else { + ops->dst(cs, iview, i); + } + ops->run(cmd, cs); + } +} + void tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, const VkRenderPassBeginInfo *info) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_image_view *iview = fb->attachments[a].attachment; const struct tu_render_pass_attachment *attachment = &cmd->state.pass->attachments[a]; if (!attachment->clear_mask) return; - const struct blit_ops *ops = &r2d_ops; - if (attachment->samples > 1) - ops = &r3d_ops; - - ops->setup(cmd, cs, attachment->format, attachment->clear_mask, ROTATE_0, - true, iview->ubwc_enabled); - ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent); - ops->clear_value(cs, attachment->format, &info->pClearValues[a]); - /* 
Wait for any flushes at the beginning of the renderpass to complete */ tu_cs_emit_wfi(cs); - for (uint32_t i = 0; i < fb->layers; i++) { - ops->dst(cs, iview, i); - ops->run(cmd, cs); + if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) { + clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT, + info, a, false); + } + if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) { + clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT, + info, a, true); + } + } else { + clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask, + info, a, false); } /* The spec doesn't explicitly say, but presumably the initial renderpass @@ -2069,7 +2161,8 @@ tu_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, const struct tu_render_pass_attachment *attachment, - bool resolve) + bool resolve, + bool separate_stencil) { tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples))); @@ -2081,14 +2174,23 @@ tu_emit_blit(struct tu_cmd_buffer *cmd, .integer = vk_format_is_int(attachment->format))); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4); - tu_cs_emit(cs, iview->RB_BLIT_DST_INFO); - tu_cs_image_ref_2d(cs, iview, 0, false); + if (separate_stencil) { + tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS); + tu_cs_emit_qw(cs, iview->stencil_base_addr); + tu_cs_emit(cs, iview->stencil_PITCH); - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3); - tu_cs_image_flag_ref(cs, iview, 0); + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil)); + } else { + tu_cs_emit(cs, iview->RB_BLIT_DST_INFO); + tu_cs_image_ref_2d(cs, iview, 0, false); - tu_cs_emit_regs(cs, - A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3); + tu_cs_image_flag_ref(cs, iview, 0); + + tu_cs_emit_regs(cs, + 
A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset)); + } tu6_emit_event_write(cmd, cs, BLIT); } @@ -2140,7 +2242,58 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, &cmd->state.pass->attachments[a]; if (attachment->load || force_load) - tu_emit_blit(cmd, cs, iview, attachment, false); + tu_emit_blit(cmd, cs, iview, attachment, false, false); + + if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load)) + tu_emit_blit(cmd, cs, iview, attachment, false, true); +} + +static void +store_cp_blit(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_image_view *iview, + uint32_t samples, + bool separate_stencil, + VkFormat format, + uint32_t gmem_offset, + uint32_t cpp) +{ + r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, + iview->ubwc_enabled, true); + if (separate_stencil) + r2d_dst_stencil(cs, iview, 0); + else + r2d_dst(cs, iview, 0); + + tu_cs_emit_regs(cs, + A6XX_SP_PS_2D_SRC_INFO( + .color_format = tu6_format_texture(format, TILE6_2).fmt, + .tile_mode = TILE6_2, + .srgb = vk_format_is_srgb(format), + .samples = tu_msaa_samples(samples), + .samples_average = !vk_format_is_int(format), + .unk20 = 1, + .unk22 = 1), + /* note: src size does not matter when not scaling */ + A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff), + A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset), + A6XX_SP_PS_2D_SRC_HI(), + A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp)); + + /* sync GMEM writes with CACHE. */ + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + + /* Wait for CACHE_INVALIDATE to land */ + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt7(cs, CP_BLIT, 1); + tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to + * sysmem, and we generally assume that GMEM renderpasses leave their + * results in sysmem, so we need to flush manually here. 
+ */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); } void @@ -2149,13 +2302,12 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, uint32_t a, uint32_t gmem_a) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; const VkRect2D *render_area = &cmd->state.render_area; struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a]; - struct tu_image_view *iview = fb->attachments[a].attachment; + struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment; struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a]; - if (!dst->store) + if (!dst->store && !dst->store_stencil) return; uint32_t x1 = render_area->offset.x; @@ -2176,7 +2328,10 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, /* use fast path when render area is aligned, except for unsupported resolve cases */ if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) { - tu_emit_blit(cmd, cs, iview, src, true); + if (dst->store) + tu_emit_blit(cmd, cs, iview, src, true, false); + if (dst->store_stencil) + tu_emit_blit(cmd, cs, iview, src, true, true); return; } @@ -2188,38 +2343,18 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, return; } - r2d_setup_common(cmd, cs, dst->format, VK_IMAGE_ASPECT_COLOR_BIT, - ROTATE_0, false, iview->ubwc_enabled, true); - r2d_dst(cs, iview, 0); r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); - tu_cs_emit_regs(cs, - A6XX_SP_PS_2D_SRC_INFO( - .color_format = tu6_format_texture(src->format, TILE6_2).fmt, - .tile_mode = TILE6_2, - .srgb = vk_format_is_srgb(src->format), - .samples = tu_msaa_samples(src->samples), - .samples_average = !vk_format_is_int(src->format), - .unk20 = 1, - .unk22 = 1), - /* note: src size does not matter when not scaling */ - A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff), - A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset), - A6XX_SP_PS_2D_SRC_HI(), - A6XX_SP_PS_2D_SRC_PITCH(.pitch = 
fb->tile0.width * src->cpp)); - - /* sync GMEM writes with CACHE. */ - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + VkFormat format = src->format; + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) + format = VK_FORMAT_D32_SFLOAT; - /* Wait for CACHE_INVALIDATE to land */ - tu_cs_emit_wfi(cs); - - tu_cs_emit_pkt7(cs, CP_BLIT, 1); - tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to - * sysmem, and we generally assume that GMEM renderpasses leave their - * results in sysmem, so we need to flush manually here. - */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + if (dst->store) { + store_cp_blit(cmd, cs, iview, src->samples, false, format, + src->gmem_offset, src->cpp); + } + if (dst->store_stencil) { + store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT, + src->gmem_offset_stencil, src->samples); + } } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index b7b43ba485f..eb3f69a6e80 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -290,11 +290,18 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, A6XX_GRAS_LRZ_BUFFER_PITCH(0), A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0)); - if (attachment->format == VK_FORMAT_S8_UINT) { + if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT || + attachment->format == VK_FORMAT_S8_UINT) { + tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6); tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value); - tu_cs_image_ref(cs, iview, 0); - tu_cs_emit(cs, attachment->gmem_offset); + if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + tu_cs_image_stencil_ref(cs, iview, 0); + tu_cs_emit(cs, attachment->gmem_offset_stencil); + } else { + tu_cs_image_ref(cs, iview, 0); + tu_cs_emit(cs, attachment->gmem_offset); + } } else { tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0)); @@ -1053,7 +1060,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, * renderpass, 
this would avoid emitting both sysmem/gmem versions * * emit two texture descriptors for each input, as a workaround for - * d24s8, which can be sampled as both float (depth) and integer (stencil) + * d24s8/d32s8, which can be sampled as both float (depth) and integer (stencil) * tu_shader lowers uint input attachment loads to use the 2nd descriptor * in the pair * TODO: a smarter workaround @@ -1077,6 +1084,8 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i]; + uint32_t gmem_offset = att->gmem_offset; + uint32_t cpp = att->cpp; memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4); @@ -1102,6 +1111,19 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, } } + if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK; + dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT); + dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK); + dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_PITCH << 6); + dst[3] = 0; + dst[4] = iview->stencil_base_addr; + dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32; + + cpp = att->samples; + gmem_offset = att->gmem_offset_stencil; + } + if (!gmem) continue; @@ -1110,9 +1132,9 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); dst[2] = A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | - A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * att->cpp); + A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp); dst[3] = 0; - dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset; + dst[4] = cmd->device->physical_device->gmem_base + gmem_offset; dst[5] = A6XX_TEX_CONST_5_DEPTH(1); for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) dst[i] = 0; diff --git a/src/freedreno/vulkan/tu_formats.c b/src/freedreno/vulkan/tu_formats.c index 958cc9b407e..cdd1b046643 
100644 --- a/src/freedreno/vulkan/tu_formats.c +++ b/src/freedreno/vulkan/tu_formats.c @@ -226,7 +226,7 @@ static const struct tu_native_format tu6_format_table[] = { TU6_xTC(S8_UINT, 8_UINT, WZYX), /* 127 */ TU6_xxx(D16_UNORM_S8_UINT, X8Z16_UNORM, WZYX), /* 128 */ TU6_xTC(D24_UNORM_S8_UINT, 8_8_8_8_UNORM, WZYX), /* 129 */ - TU6_xxx(D32_SFLOAT_S8_UINT, x, WZYX), /* 130 */ + TU6_xTC(D32_SFLOAT_S8_UINT, NONE, WZYX), /* 130 */ /* compressed */ TU6_xTx(BC1_RGB_UNORM_BLOCK, DXT1, WZYX), /* 131 */ @@ -449,6 +449,12 @@ tu_physical_device_get_format_properties( if (tu6_pipe2depth(format) != (enum a6xx_depth_format)~0) optimal |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; + /* D32_SFLOAT_S8_UINT is tiled as two images, so there is no linear format. + * The blob enables some linear features, but it's not useful, so don't bother. + */ + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) + linear = 0; + end: out_properties->linearTilingFeatures = linear; out_properties->optimalTilingFeatures = optimal; diff --git a/src/freedreno/vulkan/tu_image.c b/src/freedreno/vulkan/tu_image.c index 2e4a2449193..18c75e9f19f 100644 --- a/src/freedreno/vulkan/tu_image.c +++ b/src/freedreno/vulkan/tu_image.c @@ -43,6 +43,7 @@ tu6_plane_count(VkFormat format) default: return 1; case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + case VK_FORMAT_D32_SFLOAT_S8_UINT: return 2; case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: return 3; @@ -58,13 +59,15 @@ tu6_plane_format(VkFormat format, uint32_t plane) return plane ? VK_FORMAT_R8G8_UNORM : VK_FORMAT_R8_UNORM; case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: return VK_FORMAT_R8_UNORM; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return plane ?
VK_FORMAT_S8_UINT : VK_FORMAT_D32_SFLOAT; default: return format; } } static uint32_t -tu6_plane_index(VkImageAspectFlags aspect_mask) +tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask) { switch (aspect_mask) { default: @@ -73,6 +76,8 @@ tu6_plane_index(VkImageAspectFlags aspect_mask) return 1; case VK_IMAGE_ASPECT_PLANE_2_BIT: return 2; + case VK_IMAGE_ASPECT_STENCIL_BIT: + return format == VK_FORMAT_D32_SFLOAT_S8_UINT; } } @@ -228,6 +233,10 @@ tu_image_create(VkDevice _device, width0 = (width0 + 1) >> 1; height0 = (height0 + 1) >> 1; break; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + /* no UBWC for separate stencil */ + ubwc_enabled = false; + break; default: break; } @@ -372,6 +381,14 @@ tu_cs_image_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t la tu_cs_emit_qw(cs, iview->base_addr + iview->layer_size * layer); } +void +tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +{ + tu_cs_emit(cs, iview->stencil_PITCH); + tu_cs_emit(cs, iview->stencil_layer_size >> 6); + tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); +} + void tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, bool src) { @@ -420,7 +437,8 @@ tu_image_view_init(struct tu_image_view *iview, memset(iview->descriptor, 0, sizeof(iview->descriptor)); - struct fdl_layout *layout = &image->layout[tu6_plane_index(aspect_mask)]; + struct fdl_layout *layout = + &image->layout[tu6_plane_index(image->vk_format, aspect_mask)]; uint32_t width = u_minify(layout->width0, range->baseMipLevel); uint32_t height = u_minify(layout->height0, range->baseMipLevel); @@ -447,6 +465,9 @@ tu_image_view_init(struct tu_image_view *iview, uint32_t ubwc_pitch = fdl_ubwc_pitch(layout, range->baseMipLevel); uint32_t layer_size = fdl_layer_stride(layout, range->baseMipLevel); + if (aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) + format = tu6_plane_format(format, tu6_plane_index(format, aspect_mask)); + 
struct tu_native_format fmt = tu6_format_texture(format, layout->tile_mode); /* note: freedreno layout assumes no TILE_ALL bit for non-UBWC * this means smaller mipmap levels have a linear tile mode @@ -642,6 +663,14 @@ tu_image_view_init(struct tu_image_view *iview, .color_format = cfmt.fmt, .color_swap = cfmt.swap, .flags = ubwc_enabled).value; + + if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + layout = &image->layout[1]; + iview->stencil_base_addr = image->bo->iova + image->bo_offset + + fdl_surface_offset(layout, range->baseMipLevel, range->baseArrayLayer); + iview->stencil_layer_size = fdl_layer_stride(layout, range->baseMipLevel); + iview->stencil_PITCH = A6XX_RB_STENCIL_BUFFER_PITCH(fdl_pitch(layout, range->baseMipLevel)).value; + } } VkResult @@ -720,7 +749,7 @@ tu_GetImageSubresourceLayout(VkDevice _device, TU_FROM_HANDLE(tu_image, image, _image); struct fdl_layout *layout = - &image->layout[tu6_plane_index(pSubresource->aspectMask)]; + &image->layout[tu6_plane_index(image->vk_format, pSubresource->aspectMask)]; const struct fdl_slice *slice = layout->slices + pSubresource->mipLevel; pLayout->offset = diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c index af5a190f80a..fdc3048fdd2 100644 --- a/src/freedreno/vulkan/tu_pass.c +++ b/src/freedreno/vulkan/tu_pass.c @@ -350,12 +350,20 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass, uint32_t cpp_total = 0; for (uint32_t i = 0; i < pass->attachment_count; i++) { struct tu_render_pass_attachment *att = &pass->attachments[i]; + bool cpp1 = (att->cpp == 1); if (att->gmem_offset >= 0) { cpp_total += att->cpp; + + /* take into account the separate stencil: */ + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + cpp1 = (att->samples == 1); + cpp_total += att->samples; + } + /* texture pitch must be aligned to 64, use a tile_align_w that is * a multiple of 64 for cpp==1 attachment to work as input attachment */ - if (att->cpp == 1 && tile_align_w % 64 != 0) { + if (cpp1 
&& tile_align_w % 64 != 0) { tile_align_w *= 2; block_align_shift -= 1; } @@ -379,8 +387,8 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass, * optimal: nblocks = {13, 51}, pixels = 208896 */ uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align; - uint32_t offset = 0, pixels = ~0u; - for (uint32_t i = 0; i < pass->attachment_count; i++) { + uint32_t offset = 0, pixels = ~0u, i; + for (i = 0; i < pass->attachment_count; i++) { struct tu_render_pass_attachment *att = &pass->attachments[i]; if (att->gmem_offset < 0) continue; @@ -390,18 +398,33 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass, uint32_t align = MAX2(1, att->cpp >> block_align_shift); uint32_t nblocks = MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align); - if (nblocks > gmem_blocks) { - pixels = 0; + if (nblocks > gmem_blocks) break; - } gmem_blocks -= nblocks; cpp_total -= att->cpp; offset += nblocks * gmem_align; pixels = MIN2(pixels, nblocks * gmem_align / att->cpp); + + /* repeat the same for separate stencil */ + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + att->gmem_offset_stencil = offset; + + /* note: for s8_uint, block align is always 1 */ + uint32_t nblocks = gmem_blocks * att->samples / cpp_total; + if (nblocks > gmem_blocks) + break; + + gmem_blocks -= nblocks; + cpp_total -= att->samples; + offset += nblocks * gmem_align; + pixels = MIN2(pixels, nblocks * gmem_align / att->samples); + } } - pass->gmem_pixels = pixels; + /* if the loop didn't complete then the gmem config is impossible */ + if (i == pass->attachment_count) + pass->gmem_pixels = pixels; } static void @@ -437,6 +460,16 @@ attachment_set_ops(struct tu_render_pass_attachment *att, att->load = stencil_load; att->store = stencil_store; break; + case VK_FORMAT_D32_SFLOAT_S8_UINT: /* separate stencil */ + if (att->clear_mask) + att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT; + if (stencil_clear) + att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT; + if (stencil_load) + att->load_stencil = 
true; + if (stencil_store) + att->store_stencil = true; + break; default: break; } @@ -600,7 +633,13 @@ tu_CreateRenderPass2(VkDevice _device, att->format = pCreateInfo->pAttachments[i].format; att->samples = pCreateInfo->pAttachments[i].samples; - att->cpp = vk_format_get_blocksize(att->format) * att->samples; + /* for d32s8, cpp is for the depth image, and + * att->samples will be used as the cpp for the stencil image + */ + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) + att->cpp = 4 * att->samples; + else + att->cpp = vk_format_get_blocksize(att->format) * att->samples; att->gmem_offset = -1; attachment_set_ops(att, diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 90f0b31cac5..9bccc20b743 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -1349,6 +1349,11 @@ struct tu_image_view uint32_t RB_2D_DST_INFO; uint32_t RB_BLIT_DST_INFO; + + /* for d32s8 separate stencil */ + uint64_t stencil_base_addr; + uint32_t stencil_layer_size; + uint32_t stencil_PITCH; }; struct tu_sampler_ycbcr_conversion { @@ -1378,6 +1383,12 @@ tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t void tu_cs_image_flag_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); +void +tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); + +#define tu_image_view_stencil(iview, x) \ + ((iview->x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT)) + VkResult tu_image_create(VkDevice _device, const VkImageCreateInfo *pCreateInfo, @@ -1484,6 +1495,10 @@ struct tu_render_pass_attachment bool load; bool store; int32_t gmem_offset; + /* for D32S8 separate stencil: */ + bool load_stencil; + bool store_stencil; + int32_t gmem_offset_stencil; }; struct tu_render_pass diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h index fb914cf0ceb..e4dd3094fd3 100644 --- a/src/freedreno/vulkan/tu_util.h +++ 
b/src/freedreno/vulkan/tu_util.h @@ -227,6 +227,7 @@ tu6_pipe2depth(VkFormat format) case VK_FORMAT_D24_UNORM_S8_UINT: return DEPTH6_24_8; case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: case VK_FORMAT_S8_UINT: return DEPTH6_32; default: -- 2.30.2