From 2e084c2cb3699e846753b31bd63ed6cd18cd73f8 Mon Sep 17 00:00:00 2001
From: Jonathan Marek
Date: Fri, 13 Mar 2020 11:57:23 -0400
Subject: [PATCH] turnip: new clear/blit implementation with shader path fallback

The shader path is used to implement the following cases:
* stencil aspect mask on D24S8 (for image_to_buffer, buffer_to_image)
* clear/copy msaa destination (2D engine can't have msaa dest)

Signed-off-by: Jonathan Marek
Part-of:
---
 src/freedreno/registers/a6xx.xml | 7 +-
 src/freedreno/vulkan/meson.build | 8 +-
 src/freedreno/vulkan/tu_blit.c | 372 ----
 src/freedreno/vulkan/tu_blit.h | 145 --
 src/freedreno/vulkan/tu_clear_blit.c | 2390 ++++++++++++++++++++++++
 src/freedreno/vulkan/tu_cmd_buffer.c | 447 +----
 src/freedreno/vulkan/tu_formats.c | 387 +---
 src/freedreno/vulkan/tu_image.c | 28 +-
 src/freedreno/vulkan/tu_meta_blit.c | 91 -
 src/freedreno/vulkan/tu_meta_buffer.c | 75 -
 src/freedreno/vulkan/tu_meta_clear.c | 238 ---
 src/freedreno/vulkan/tu_meta_copy.c | 215 ---
 src/freedreno/vulkan/tu_meta_resolve.c | 67 -
 src/freedreno/vulkan/tu_pass.c | 13 +-
 src/freedreno/vulkan/tu_private.h | 85 +-
 15 files changed, 2578 insertions(+), 1990 deletions(-)
 delete mode 100644 src/freedreno/vulkan/tu_blit.c
 delete mode 100644 src/freedreno/vulkan/tu_blit.h
 create mode 100644 src/freedreno/vulkan/tu_clear_blit.c
 delete mode 100644 src/freedreno/vulkan/tu_meta_blit.c
 delete mode 100644 src/freedreno/vulkan/tu_meta_buffer.c
 delete mode 100644 src/freedreno/vulkan/tu_meta_clear.c
 delete mode 100644 src/freedreno/vulkan/tu_meta_copy.c
 delete mode 100644 src/freedreno/vulkan/tu_meta_resolve.c

diff --git a/src/freedreno/registers/a6xx.xml b/src/freedreno/registers/a6xx.xml
index 8dd86993747..ccd00fb5ff6 100644
--- a/src/freedreno/registers/a6xx.xml
+++ b/src/freedreno/registers/a6xx.xml
@@ -2383,24 +2383,27 @@ to upconvert to 32b float internally?
-
+
+
+
+
@@ -3120,12 +3123,14 @@ to upconvert to 32b float internally?
+
+

diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build
index 47af6995595..2510b161c65 100644
--- a/src/freedreno/vulkan/meson.build
+++ b/src/freedreno/vulkan/meson.build
@@ -40,8 +40,7 @@ tu_extensions_c = custom_target(
 )

 libtu_files = files(
- 'tu_blit.c',
- 'tu_blit.h',
+ 'tu_clear_blit.c',
 'tu_cmd_buffer.c',
 'tu_cs.c',
 'tu_cs.h',
@@ -52,11 +51,6 @@ libtu_files = files(
 'tu_fence.c',
 'tu_formats.c',
 'tu_image.c',
- 'tu_meta_blit.c',
- 'tu_meta_buffer.c',
- 'tu_meta_clear.c',
- 'tu_meta_copy.c',
- 'tu_meta_resolve.c',
 'tu_pass.c',
 'tu_pipeline.c',
 'tu_pipeline_cache.c',
diff --git a/src/freedreno/vulkan/tu_blit.c b/src/freedreno/vulkan/tu_blit.c
deleted file mode 100644
index 469fb517dd6..00000000000
--- a/src/freedreno/vulkan/tu_blit.c
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright © 2019 Valve Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Authors: - * Jonathan Marek - * - */ - -#include "tu_blit.h" - -#include "a6xx.xml.h" -#include "adreno_common.xml.h" -#include "adreno_pm4.xml.h" - -#include "vk_format.h" - -#include "tu_cs.h" - -/* TODO: - * - Avoid disabling tiling for swapped formats - * (image_to_image copy doesn't deal with it) - * - Fix d24_unorm_s8_uint support & aspects - * - UBWC - */ - -static VkFormat -blit_copy_format(VkFormat format) -{ - switch (vk_format_get_blocksizebits(format)) { - case 8: return VK_FORMAT_R8_UINT; - case 16: return VK_FORMAT_R16_UINT; - case 32: return VK_FORMAT_R32_UINT; - case 64: return VK_FORMAT_R32G32_UINT; - case 96: return VK_FORMAT_R32G32B32_UINT; - case 128:return VK_FORMAT_R32G32B32A32_UINT; - default: - unreachable("unhandled format size"); - } -} - -static uint32_t -blit_image_info(const struct tu_blit_surf *img, struct tu_native_format fmt, bool stencil_read) -{ - if (fmt.fmt == FMT6_Z24_UNORM_S8_UINT) - fmt.fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - - if (stencil_read) - fmt.swap = XYZW; - - return A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt.fmt) | - A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(img->tile_mode) | - A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(fmt.swap) | - COND(vk_format_is_srgb(img->fmt), A6XX_SP_PS_2D_SRC_INFO_SRGB) | - COND(img->ubwc_size, A6XX_SP_PS_2D_SRC_INFO_FLAGS); -} - -static void -emit_blit_step(struct tu_cmd_buffer *cmdbuf, struct tu_cs *cs, - const struct tu_blit *blt) -{ - struct tu_physical_device *phys_dev = cmdbuf->device->physical_device; - - struct tu_native_format dfmt = tu6_format_color(blt->dst.fmt, blt->dst.image_tile_mode); - struct tu_native_format sfmt = tu6_format_texture(blt->src.fmt, blt->src.image_tile_mode); - - if (dfmt.fmt == FMT6_Z24_UNORM_S8_UINT) - dfmt.fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - - enum a6xx_2d_ifmt ifmt = tu6_fmt_to_ifmt(dfmt.fmt); - - if (vk_format_is_srgb(blt->dst.fmt)) { - assert(ifmt == R2D_UNORM8); - ifmt = R2D_UNORM8_SRGB; - } - - uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL_ROTATE(blt->rotation) | - COND(blt->type == TU_BLIT_CLEAR, A6XX_RB_2D_BLIT_CNTL_SOLID_COLOR) | - A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(dfmt.fmt) | /* not required? */ - COND(dfmt.fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8, - A6XX_RB_2D_BLIT_CNTL_D24S8) | - A6XX_RB_2D_BLIT_CNTL_MASK(0xf) | - A6XX_RB_2D_BLIT_CNTL_IFMT(ifmt); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); - tu_cs_emit(cs, blit_cntl); - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); - tu_cs_emit(cs, blit_cntl); - - /* - * Emit source: - */ - if (blt->type == TU_BLIT_CLEAR) { - tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); - tu_cs_emit(cs, blt->clear_value[0]); - tu_cs_emit(cs, blt->clear_value[1]); - tu_cs_emit(cs, blt->clear_value[2]); - tu_cs_emit(cs, blt->clear_value[3]); - } else { - tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 10); - tu_cs_emit(cs, blit_image_info(&blt->src, sfmt, blt->stencil_read) | - A6XX_SP_PS_2D_SRC_INFO_SAMPLES(tu_msaa_samples(blt->src.samples)) | - /* TODO: should disable this bit for integer formats ? 
*/ - COND(blt->src.samples > 1, A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | - COND(blt->filter, A6XX_SP_PS_2D_SRC_INFO_FILTER) | - 0x500000); - tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(blt->src.x + blt->src.width) | - A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(blt->src.y + blt->src.height)); - tu_cs_emit_qw(cs, blt->src.va); - tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_PITCH_PITCH(blt->src.pitch)); - - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - - if (blt->src.ubwc_size) { - tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 6); - tu_cs_emit_qw(cs, blt->src.ubwc_va); - tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_FLAGS_PITCH_PITCH(blt->src.ubwc_pitch) | - A6XX_SP_PS_2D_SRC_FLAGS_PITCH_ARRAY_PITCH(blt->src.ubwc_size >> 2)); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - } - } - - /* - * Emit destination: - */ - tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 9); - tu_cs_emit(cs, blit_image_info(&blt->dst, dfmt, false)); - tu_cs_emit_qw(cs, blt->dst.va); - tu_cs_emit(cs, A6XX_RB_2D_DST_SIZE_PITCH(blt->dst.pitch)); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - - if (blt->dst.ubwc_size) { - tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 6); - tu_cs_emit_qw(cs, blt->dst.ubwc_va); - tu_cs_emit(cs, A6XX_RB_2D_DST_FLAGS_PITCH_PITCH(blt->dst.ubwc_pitch) | - A6XX_RB_2D_DST_FLAGS_PITCH_ARRAY_PITCH(blt->dst.ubwc_size >> 2)); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - } - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_SRC_TL_X, 4); - tu_cs_emit(cs, A6XX_GRAS_2D_SRC_TL_X_X(blt->src.x)); - tu_cs_emit(cs, A6XX_GRAS_2D_SRC_BR_X_X(blt->src.x + blt->src.width - 1)); - tu_cs_emit(cs, A6XX_GRAS_2D_SRC_TL_Y_Y(blt->src.y)); - tu_cs_emit(cs, A6XX_GRAS_2D_SRC_BR_Y_Y(blt->src.y + blt->src.height - 1)); - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_DST_TL, 2); - tu_cs_emit(cs, A6XX_GRAS_2D_DST_TL_X(blt->dst.x) | - A6XX_GRAS_2D_DST_TL_Y(blt->dst.y)); - tu_cs_emit(cs, A6XX_GRAS_2D_DST_BR_X(blt->dst.x + blt->dst.width - 1) | - A6XX_GRAS_2D_DST_BR_Y(blt->dst.y + blt->dst.height - 1)); - - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, 0x3f); - tu_cs_emit_wfi(cs); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1); - tu_cs_emit(cs, 0); - - if (dfmt.fmt == FMT6_10_10_10_2_UNORM_DEST) - dfmt.fmt = FMT6_16_16_16_16_FLOAT; - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_2D_SRC_FORMAT, 1); - tu_cs_emit(cs, COND(vk_format_is_sint(blt->src.fmt), A6XX_SP_2D_SRC_FORMAT_SINT) | - COND(vk_format_is_uint(blt->src.fmt), A6XX_SP_2D_SRC_FORMAT_UINT) | - A6XX_SP_2D_SRC_FORMAT_COLOR_FORMAT(dfmt.fmt) | - COND(ifmt == R2D_UNORM8_SRGB, A6XX_SP_2D_SRC_FORMAT_SRGB) | - A6XX_SP_2D_SRC_FORMAT_MASK(0xf)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8E04, 1); - tu_cs_emit(cs, phys_dev->magic.RB_UNKNOWN_8E04_blit); - - tu_cs_emit_pkt7(cs, CP_BLIT, 1); - tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - tu_cs_emit_wfi(cs); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8E04, 1); - tu_cs_emit(cs, 0); -} - -void tu_blit(struct tu_cmd_buffer *cmdbuf, struct tu_cs *cs, - struct tu_blit *blt) -{ - struct tu_physical_device *phys_dev = cmdbuf->device->physical_device; - - switch (blt->type) { - case TU_BLIT_COPY: - blt->stencil_read = - blt->dst.fmt == VK_FORMAT_R8_UNORM && - blt->src.fmt == VK_FORMAT_D24_UNORM_S8_UINT; - - assert(vk_format_get_blocksize(blt->dst.fmt) == - 
vk_format_get_blocksize(blt->src.fmt) || blt->stencil_read); - assert(blt->src.samples == blt->dst.samples); - - if (vk_format_is_compressed(blt->src.fmt)) { - unsigned block_width = vk_format_get_blockwidth(blt->src.fmt); - unsigned block_height = vk_format_get_blockheight(blt->src.fmt); - - blt->src.pitch /= block_width; - blt->src.x /= block_width; - blt->src.y /= block_height; - blt->src.fmt = blit_copy_format(blt->src.fmt); - - /* for image_to_image copy, width/height is on the src format */ - blt->dst.width = blt->src.width = DIV_ROUND_UP(blt->src.width, block_width); - blt->dst.height = blt->src.height = DIV_ROUND_UP(blt->src.height, block_height); - } - - if (vk_format_is_compressed(blt->dst.fmt)) { - unsigned block_width = vk_format_get_blockwidth(blt->dst.fmt); - unsigned block_height = vk_format_get_blockheight(blt->dst.fmt); - - blt->dst.pitch /= block_width; - blt->dst.x /= block_width; - blt->dst.y /= block_height; - blt->dst.fmt = blit_copy_format(blt->dst.fmt); - } - - if (blt->dst.fmt == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) - blt->dst.fmt = blit_copy_format(blt->dst.fmt); - - if (blt->src.fmt == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) - blt->src.fmt = blit_copy_format(blt->src.fmt); - - /* TODO: multisample image copy does not work correctly with tiling/UBWC */ - blt->src.x *= blt->src.samples; - blt->dst.x *= blt->dst.samples; - blt->src.width *= blt->src.samples; - blt->dst.width *= blt->dst.samples; - blt->src.samples = 1; - blt->dst.samples = 1; - break; - case TU_BLIT_CLEAR: - /* unsupported format cleared as UINT32 */ - if (blt->dst.fmt == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) - blt->dst.fmt = VK_FORMAT_R32_UINT; - /* TODO: multisample image clearing also seems not to work with certain - * formats. The blob uses a shader-based clear in these cases. 
- */ - blt->dst.x *= blt->dst.samples; - blt->dst.width *= blt->dst.samples; - blt->dst.samples = 1; - blt->src = blt->dst; - break; - default: - assert(blt->dst.samples == 1); - } - - tu6_emit_event_write(cmdbuf, cs, LRZ_FLUSH, false); - tu6_emit_event_write(cmdbuf, cs, PC_CCU_FLUSH_COLOR_TS, true); - tu6_emit_event_write(cmdbuf, cs, PC_CCU_FLUSH_DEPTH_TS, true); - tu6_emit_event_write(cmdbuf, cs, PC_CCU_INVALIDATE_COLOR, false); - tu6_emit_event_write(cmdbuf, cs, PC_CCU_INVALIDATE_DEPTH, false); - - tu_cs_emit_wfi(cs); - tu_cs_emit_regs(cs, - A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass)); - - /* buffer copy setup */ - tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); - tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); - - for (unsigned layer = 0; layer < blt->layers; layer++) { - if (blt->buffer) { - struct tu_blit line_blt = *blt; - uint64_t dst_va = line_blt.dst.va, src_va = line_blt.src.va; - unsigned blocksize = vk_format_get_blocksize(blt->src.fmt); - uint32_t size = line_blt.src.width, tmp; - - while (size) { - line_blt.src.x = (src_va & 63) / blocksize; - line_blt.src.va = src_va & ~63; - tmp = MIN2(size, 0x4000 - line_blt.src.x); - - line_blt.dst.x = (dst_va & 63) / blocksize; - line_blt.dst.va = dst_va & ~63; - tmp = MIN2(tmp, 0x4000 - line_blt.dst.x); - - line_blt.src.width = line_blt.dst.width = tmp; - - emit_blit_step(cmdbuf, cs, &line_blt); - - src_va += tmp * blocksize; - dst_va += tmp * blocksize; - size -= tmp; - } - } else if ((blt->src.va & 63) || (blt->src.pitch & 63)) { - /* per line copy path (buffer_to_image) */ - assert(blt->type == TU_BLIT_COPY && !blt->src.image_tile_mode); - struct tu_blit line_blt = *blt; - uint64_t src_va = line_blt.src.va + blt->src.pitch * blt->src.y; - - line_blt.src.y = 0; - line_blt.src.pitch = 0; - line_blt.src.height = 1; - line_blt.dst.height = 1; - - for (unsigned y = 0; y < blt->src.height; y++) { - line_blt.src.x = blt->src.x + (src_va & 63) / vk_format_get_blocksize(blt->src.fmt); - line_blt.src.va = src_va & ~63; - - emit_blit_step(cmdbuf, cs, &line_blt); - - line_blt.dst.y++; - src_va += blt->src.pitch; - } - } else if ((blt->dst.va & 63) || (blt->dst.pitch & 63)) { - /* per line copy path (image_to_buffer) */ - assert(blt->type == TU_BLIT_COPY && !blt->dst.image_tile_mode); - struct tu_blit line_blt = *blt; - uint64_t dst_va = line_blt.dst.va + blt->dst.pitch * blt->dst.y; - - line_blt.dst.y = 0; - line_blt.dst.pitch = 0; - line_blt.src.height = 1; - line_blt.dst.height = 1; - - for (unsigned y = 0; y < blt->src.height; y++) { - line_blt.dst.x = blt->dst.x + (dst_va & 63) / vk_format_get_blocksize(blt->dst.fmt); - line_blt.dst.va = dst_va & ~63; - - emit_blit_step(cmdbuf, cs, &line_blt); - - line_blt.src.y++; - dst_va += blt->dst.pitch; - } - } else { - emit_blit_step(cmdbuf, cs, blt); - } - blt->dst.va += blt->dst.layer_size; - blt->src.va += blt->src.layer_size; - blt->dst.ubwc_va += blt->dst.ubwc_size; - blt->src.ubwc_va += blt->src.ubwc_size; - } - - tu6_emit_event_write(cmdbuf, cs, PC_CCU_FLUSH_COLOR_TS, true); - tu6_emit_event_write(cmdbuf, cs, PC_CCU_FLUSH_DEPTH_TS, true); - tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS, true); - tu6_emit_event_write(cmdbuf, cs, CACHE_INVALIDATE, false); -} diff --git a/src/freedreno/vulkan/tu_blit.h b/src/freedreno/vulkan/tu_blit.h deleted file mode 100644 index 62851d74778..00000000000 --- a/src/freedreno/vulkan/tu_blit.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright © 2019 Valve Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a 
- * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Authors: - * Jonathan Marek - * - */ - -#ifndef TU_BLIT_H -#define TU_BLIT_H - -#include "tu_private.h" - -#include "vk_format.h" - -struct tu_blit_surf { - VkFormat fmt; - enum a6xx_tile_mode tile_mode; - enum a6xx_tile_mode image_tile_mode; - uint64_t va; - uint32_t pitch, layer_size; - uint32_t x, y; - uint32_t width, height; - unsigned samples; - uint64_t ubwc_va; - uint32_t ubwc_pitch; - uint32_t ubwc_size; -}; - -static inline struct tu_blit_surf -tu_blit_surf(struct tu_image *image, - VkImageSubresourceLayers subres, - const VkOffset3D *offsets) -{ - unsigned layer = subres.baseArrayLayer; - if (image->type == VK_IMAGE_TYPE_3D) { - assert(layer == 0); - layer = MIN2(offsets[0].z, offsets[1].z); - } - - return (struct tu_blit_surf) { - .fmt = image->vk_format, - .tile_mode = tu6_get_image_tile_mode(image, subres.mipLevel), - .image_tile_mode = image->layout.tile_mode, - .va = tu_image_base(image, subres.mipLevel, layer), - .pitch = tu_image_stride(image, subres.mipLevel), - .layer_size = tu_layer_size(image, subres.mipLevel), - .x = MIN2(offsets[0].x, offsets[1].x), - .y = MIN2(offsets[0].y, offsets[1].y), - .width = abs(offsets[1].x - offsets[0].x), - .height = abs(offsets[1].y - offsets[0].y), - .samples = image->samples, - .ubwc_va = tu_image_ubwc_base(image, subres.mipLevel, layer), - .ubwc_pitch = tu_image_ubwc_pitch(image, subres.mipLevel), - .ubwc_size = tu_image_ubwc_size(image, subres.mipLevel), - }; -} - -static inline struct tu_blit_surf -tu_blit_surf_ext(struct tu_image *image, - VkImageSubresourceLayers subres, - VkOffset3D offset, - VkExtent3D extent) -{ - return tu_blit_surf(image, subres, (VkOffset3D[]) { - offset, {.x = offset.x + extent.width, - .y = offset.y + extent.height, - .z = offset.z} - }); -} - -static inline struct tu_blit_surf -tu_blit_surf_whole(struct tu_image *image, int level, int layer) -{ - return tu_blit_surf(image, (VkImageSubresourceLayers){ - .mipLevel = level, - .baseArrayLayer = layer, - }, (VkOffset3D[]) { - {}, { - u_minify(image->extent.width, level), - u_minify(image->extent.height, level), - } - }); -} - -static inline struct tu_blit_surf -sysmem_attachment_surf(const struct tu_image_view *view, uint32_t base_layer, - const VkRect2D *rect) -{ - return tu_blit_surf_ext(view->image, (VkImageSubresourceLayers) { - .mipLevel = view->base_mip, - .baseArrayLayer = base_layer, - }, (VkOffset3D) { - .x = rect->offset.x, - .y = rect->offset.y, - .z = 0, - }, (VkExtent3D) { - .width = rect->extent.width, - 
.height = rect->extent.height, - .depth = 1, - }); -} - - -enum tu_blit_type { - TU_BLIT_DEFAULT, - TU_BLIT_COPY, - TU_BLIT_CLEAR, -}; - -struct tu_blit { - struct tu_blit_surf dst; - struct tu_blit_surf src; - uint32_t layers; - bool filter; - bool stencil_read; - bool buffer; /* 1d copy/clear */ - enum a6xx_rotation rotation; - uint32_t clear_value[4]; - enum tu_blit_type type; -}; - -void tu_blit(struct tu_cmd_buffer *cmdbuf, struct tu_cs *cs, - struct tu_blit *blt); - -#endif /* TU_BLIT_H */ diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c new file mode 100644 index 00000000000..35469bf363b --- /dev/null +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -0,0 +1,2390 @@ +/* + * Copyright 2019-2020 Valve Corporation + * SPDX-License-Identifier: MIT + * + * Authors: + * Jonathan Marek + */ + +#include "tu_private.h" + +#include "tu_cs.h" +#include "vk_format.h" + +#include "util/format_r11g11b10f.h" +#include "util/format_rgb9e5.h" +#include "util/format_srgb.h" +#include "util/u_half.h" + +/* helper functions previously in tu_formats.c */ + +static uint32_t +tu_pack_mask(int bits) +{ + assert(bits <= 32); + return (1ull << bits) - 1; +} + +static uint32_t +tu_pack_float32_for_unorm(float val, int bits) +{ + const uint32_t max = tu_pack_mask(bits); + if (val < 0.0f) + return 0; + else if (val > 1.0f) + return max; + else + return _mesa_lroundevenf(val * (float) max); +} + +static uint32_t +tu_pack_float32_for_snorm(float val, int bits) +{ + const int32_t max = tu_pack_mask(bits - 1); + int32_t tmp; + if (val < -1.0f) + tmp = -max; + else if (val > 1.0f) + tmp = max; + else + tmp = _mesa_lroundevenf(val * (float) max); + + return tmp & tu_pack_mask(bits); +} + +static uint32_t +tu_pack_float32_for_uscaled(float val, int bits) +{ + const uint32_t max = tu_pack_mask(bits); + if (val < 0.0f) + return 0; + else if (val > (float) max) + return max; + else + return (uint32_t) val; +} + +static uint32_t +tu_pack_float32_for_sscaled(float val, int bits) +{ + const int32_t max = tu_pack_mask(bits - 1); + const int32_t min = -max - 1; + int32_t tmp; + if (val < (float) min) + tmp = min; + else if (val > (float) max) + tmp = max; + else + tmp = (int32_t) val; + + return tmp & tu_pack_mask(bits); +} + +static uint32_t +tu_pack_uint32_for_uint(uint32_t val, int bits) +{ + return val & tu_pack_mask(bits); +} + +static uint32_t +tu_pack_int32_for_sint(int32_t val, int bits) +{ + return val & tu_pack_mask(bits); +} + +static uint32_t +tu_pack_float32_for_sfloat(float val, int bits) +{ + assert(bits == 16 || bits == 32); + return bits == 16 ? 
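/* 16-bit channels take an IEEE half; for 32 bits fui() just reinterprets the float's bit pattern */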
util_float_to_half(val) : fui(val); +} + +union tu_clear_component_value { + float float32; + int32_t int32; + uint32_t uint32; +}; + +static uint32_t +tu_pack_clear_component_value(union tu_clear_component_value val, + const struct util_format_channel_description *ch) +{ + uint32_t packed; + + switch (ch->type) { + case UTIL_FORMAT_TYPE_UNSIGNED: + /* normalized, scaled, or pure integer */ + if (ch->normalized) + packed = tu_pack_float32_for_unorm(val.float32, ch->size); + else if (ch->pure_integer) + packed = tu_pack_uint32_for_uint(val.uint32, ch->size); + else + packed = tu_pack_float32_for_uscaled(val.float32, ch->size); + break; + case UTIL_FORMAT_TYPE_SIGNED: + /* normalized, scaled, or pure integer */ + if (ch->normalized) + packed = tu_pack_float32_for_snorm(val.float32, ch->size); + else if (ch->pure_integer) + packed = tu_pack_int32_for_sint(val.int32, ch->size); + else + packed = tu_pack_float32_for_sscaled(val.float32, ch->size); + break; + case UTIL_FORMAT_TYPE_FLOAT: + packed = tu_pack_float32_for_sfloat(val.float32, ch->size); + break; + default: + unreachable("unexpected channel type"); + packed = 0; + break; + } + + assert((packed & tu_pack_mask(ch->size)) == packed); + return packed; +} + +static const struct util_format_channel_description * +tu_get_format_channel_description(const struct util_format_description *desc, + int comp) +{ + switch (desc->swizzle[comp]) { + case PIPE_SWIZZLE_X: + return &desc->channel[0]; + case PIPE_SWIZZLE_Y: + return &desc->channel[1]; + case PIPE_SWIZZLE_Z: + return &desc->channel[2]; + case PIPE_SWIZZLE_W: + return &desc->channel[3]; + default: + return NULL; + } +} + +static union tu_clear_component_value +tu_get_clear_component_value(const VkClearValue *val, int comp, + enum util_format_colorspace colorspace) +{ + assert(comp < 4); + + union tu_clear_component_value tmp; + switch (colorspace) { + case UTIL_FORMAT_COLORSPACE_ZS: + assert(comp < 2); + if (comp == 0) + tmp.float32 = val->depthStencil.depth; + else + tmp.uint32 = val->depthStencil.stencil; + break; + case UTIL_FORMAT_COLORSPACE_SRGB: + if (comp < 3) { + tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]); + break; + } + default: + assert(comp < 4); + tmp.uint32 = val->color.uint32[comp]; + break; + } + + return tmp; +} + +/* r2d_ = BLIT_OP_SCALE operations */ + +static enum a6xx_2d_ifmt +format_to_ifmt(enum a6xx_format fmt) +{ + switch (fmt) { + case FMT6_A8_UNORM: + case FMT6_8_UNORM: + case FMT6_8_SNORM: + case FMT6_8_8_UNORM: + case FMT6_8_8_SNORM: + case FMT6_8_8_8_8_UNORM: + case FMT6_8_8_8_X8_UNORM: + case FMT6_8_8_8_8_SNORM: + case FMT6_4_4_4_4_UNORM: + case FMT6_5_5_5_1_UNORM: + case FMT6_5_6_5_UNORM: + case FMT6_Z24_UNORM_S8_UINT: + case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: + return R2D_UNORM8; + + case FMT6_32_UINT: + case FMT6_32_SINT: + case FMT6_32_32_UINT: + case FMT6_32_32_SINT: + case FMT6_32_32_32_32_UINT: + case FMT6_32_32_32_32_SINT: + return R2D_INT32; + + case FMT6_16_UINT: + case FMT6_16_SINT: + case FMT6_16_16_UINT: + case FMT6_16_16_SINT: + case FMT6_16_16_16_16_UINT: + case FMT6_16_16_16_16_SINT: + case FMT6_10_10_10_2_UINT: + return R2D_INT16; + + case FMT6_8_UINT: + case FMT6_8_SINT: + case FMT6_8_8_UINT: + case FMT6_8_8_SINT: + case FMT6_8_8_8_8_UINT: + case FMT6_8_8_8_8_SINT: + return R2D_INT8; + + case FMT6_16_UNORM: + case FMT6_16_SNORM: + case FMT6_16_16_UNORM: + case FMT6_16_16_SNORM: + case FMT6_16_16_16_16_UNORM: + case FMT6_16_16_16_16_SNORM: + case FMT6_32_FLOAT: + case FMT6_32_32_FLOAT: + case 
FMT6_32_32_32_32_FLOAT: + return R2D_FLOAT32; + + case FMT6_16_FLOAT: + case FMT6_16_16_FLOAT: + case FMT6_16_16_16_16_FLOAT: + case FMT6_11_11_10_FLOAT: + case FMT6_10_10_10_2_UNORM: + case FMT6_10_10_10_2_UNORM_DEST: + return R2D_FLOAT16; + + default: + unreachable("bad format"); + return 0; + } +} + +static void +r2d_coords(struct tu_cs *cs, + const VkOffset2D *dst, + const VkOffset2D *src, + const VkExtent2D *extent) +{ + tu_cs_emit_regs(cs, + A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y), + A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1)); + + if (!src) + return; + + tu_cs_emit_regs(cs, + A6XX_GRAS_2D_SRC_TL_X(.x = src->x), + A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1), + A6XX_GRAS_2D_SRC_TL_Y(.y = src->y), + A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1)); +} + +static void +r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) +{ + uint32_t clear_value[4] = {}; + + switch (format) { + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + /* cleared as r8g8b8a8_unorm using special format */ + clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24); + clear_value[1] = clear_value[0] >> 8; + clear_value[2] = clear_value[0] >> 16; + clear_value[3] = val->depthStencil.stencil; + break; + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D32_SFLOAT: + /* R2D_FLOAT32 */ + clear_value[0] = fui(val->depthStencil.depth); + break; + case VK_FORMAT_S8_UINT: + clear_value[0] = val->depthStencil.stencil; + break; + case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: + /* cleared as UINT32 */ + clear_value[0] = float3_to_rgb9e5(val->color.float32); + break; + default: + assert(!vk_format_is_depth_or_stencil(format)); + const struct util_format_description *desc = vk_format_description(format); + enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format)); + + assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN || + format == VK_FORMAT_B10G11R11_UFLOAT_PACK32)); + + for (unsigned i = 0; i < desc->nr_channels; i++) { + const struct util_format_channel_description *ch = &desc->channel[i]; + if (ifmt == R2D_UNORM8) { + float linear = val->color.float32[i]; + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3) + linear = util_format_linear_to_srgb_float(val->color.float32[i]); + + if (ch->type == UTIL_FORMAT_TYPE_SIGNED) + clear_value[i] = tu_pack_float32_for_snorm(linear, 8); + else + clear_value[i] = tu_pack_float32_for_unorm(linear, 8); + } else if (ifmt == R2D_FLOAT16) { + clear_value[i] = util_float_to_half(val->color.float32[i]); + } else { + assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 || + ifmt == R2D_INT16 || ifmt == R2D_INT8); + clear_value[i] = val->color.uint32[i]; + } + } + break; + } + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); + tu_cs_emit_array(cs, clear_value, 4); +} + +static void +r2d_src(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_image *image, + VkFormat vk_format, + uint32_t level, + uint32_t layer, + bool linear_filter, + bool stencil_read) +{ + struct tu_native_format format = tu6_format_image_src(image, vk_format, level); + + /* stencil readout path fails with UBWC enabled (why?) 
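(presumably the XYZW swap readout bypasses UBWC flag decoding; the assert below keeps this path off UBWC sources for now)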
*/ + assert(!stencil_read || !image->layout.ubwc_layer_size); + + if (stencil_read) + format.swap = XYZW; + + tu_cs_emit_regs(cs, + A6XX_SP_PS_2D_SRC_INFO( + .color_format = format.fmt, + .tile_mode = format.tile_mode, + .color_swap = format.swap, + .flags = image->layout.ubwc_layer_size != 0, + .srgb = vk_format_is_srgb(vk_format), + .samples = tu_msaa_samples(image->samples), + .filter = linear_filter, + .samples_average = image->samples > 1 && + !vk_format_is_int(vk_format) && + !vk_format_is_depth_or_stencil(vk_format), + .unk20 = 1, + .unk22 = 1), + A6XX_SP_PS_2D_SRC_SIZE( + .width = tu_minify(image->extent.width, level), + .height = tu_minify(image->extent.height, level)), + A6XX_SP_PS_2D_SRC(tu_image_base_ref(image, level, layer)), + A6XX_SP_PS_2D_SRC_PITCH(.pitch = tu_image_pitch(image, level))); + + if (image->layout.ubwc_layer_size) { + tu_cs_emit_regs(cs, + A6XX_SP_PS_2D_SRC_FLAGS(tu_image_ubwc_base_ref(image, level, layer)), + A6XX_SP_PS_2D_SRC_FLAGS_PITCH(.pitch = tu_image_ubwc_pitch(image, level))); + } +} + +static void +r2d_src_buffer(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat vk_format, + uint64_t va, uint32_t pitch, + uint32_t width, uint32_t height) +{ + struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR); + + tu_cs_emit_regs(cs, + A6XX_SP_PS_2D_SRC_INFO( + .color_format = format.fmt, + .color_swap = format.swap, + .srgb = vk_format_is_srgb(vk_format), + .unk20 = 1, + .unk22 = 1), + A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height), + A6XX_SP_PS_2D_SRC_LO((uint32_t) va), + A6XX_SP_PS_2D_SRC_HI(va >> 32), + A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch)); +} + +static void +r2d_dst(struct tu_cs *cs, + struct tu_image *image, + VkFormat vk_format, + uint32_t level, + uint32_t layer) +{ + struct tu_native_format format = tu6_format_image(image, vk_format, level); + + assert(image->samples == 1); + + tu_cs_emit_regs(cs, + A6XX_RB_2D_DST_INFO( + .color_format = format.fmt, + .tile_mode = format.tile_mode, + .color_swap = format.swap, + .flags = image->layout.ubwc_layer_size != 0, + .srgb = vk_format_is_srgb(image->vk_format)), + A6XX_RB_2D_DST(tu_image_base_ref(image, level, layer)), + A6XX_RB_2D_DST_SIZE(.pitch = tu_image_pitch(image, level))); + + if (image->layout.ubwc_layer_size) { + tu_cs_emit_regs(cs, + A6XX_RB_2D_DST_FLAGS(tu_image_ubwc_base_ref(image, level, layer)), + A6XX_RB_2D_DST_FLAGS_PITCH(.pitch = tu_image_ubwc_pitch(image, level))); + } +} + +static void +r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) +{ + struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR); + + tu_cs_emit_regs(cs, + A6XX_RB_2D_DST_INFO( + .color_format = format.fmt, + .color_swap = format.swap, + .srgb = vk_format_is_srgb(vk_format)), + A6XX_RB_2D_DST_LO((uint32_t) va), + A6XX_RB_2D_DST_HI(va >> 32), + A6XX_RB_2D_DST_SIZE(.pitch = pitch)); +} + +static void +r2d_setup_common(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat vk_format, + enum a6xx_rotation rotation, + bool clear, + uint8_t mask, + bool scissor) +{ + enum a6xx_format format = tu6_base_format(vk_format); + enum a6xx_2d_ifmt ifmt = format_to_ifmt(format); + uint32_t unknown_8c01 = 0; + + if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) { + /* preserve depth channels */ + if (mask == 0x8) + unknown_8c01 = 0x00084001; + /* preserve stencil channel */ + if (mask == 0x7) + unknown_8c01 = 0x08000041; + } + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1); + tu_cs_emit(cs, unknown_8c01); + + uint32_t blit_cntl = 
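/* a single control word, written unchanged to both the RB and GRAS copies of 2D_BLIT_CNTL below */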
A6XX_RB_2D_BLIT_CNTL( + .scissor = scissor, + .rotate = rotation, + .solid_color = clear, + .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear, + .color_format = format, + .mask = 0xf, + .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt, + ).value; + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); + tu_cs_emit(cs, blit_cntl); + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); + tu_cs_emit(cs, blit_cntl); + + if (format == FMT6_10_10_10_2_UNORM_DEST) + format = FMT6_16_16_16_16_FLOAT; + + tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT( + .sint = vk_format_is_sint(vk_format), + .uint = vk_format_is_uint(vk_format), + .color_format = format, + .srgb = vk_format_is_srgb(vk_format), + .mask = 0xf)); +} + +static void +r2d_setup(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat vk_format, + enum a6xx_rotation rotation, + bool clear, + uint8_t mask) +{ + const struct tu_physical_device *phys_dev = cmd->device->physical_device; + + /* TODO: flushing with barriers instead of blindly always flushing */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false); + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + + tu_cs_emit_wfi(cs); + tu_cs_emit_regs(cs, + A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass)); + + r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false); +} + +static void +r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + tu_cs_emit_pkt7(cs, CP_BLIT, 1); + tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + /* TODO: flushing with barriers instead of blindly always flushing */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true); + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); +} + +/* r3d_ = shader path operations */ + +static void +r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts) +{ + static const instr_t vs_code[] = { + /* r0.xyz = r0.w ? c1.xyz : c0.xyz + * r1.xy = r0.w ? c1.zw : c0.zw + * r0.w = 1.0f + */ + { .cat3 = { + .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 2, .dst = 0, + .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1, + .src2 = 3, + .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}, + } }, + { .cat3 = { + .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 1, .dst = 4, + .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1, + .src2 = 3, + .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}, + } }, + { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, .dst = 3, + .src_im = 1, .fim_val = 1.0f } }, + { .cat0 = { .opc = OPC_END } }, + }; +#define FS_OFFSET (16 * sizeof(instr_t)) + STATIC_ASSERT(sizeof(vs_code) <= FS_OFFSET); + + /* vs inputs: only vtx id in r0.w */ + tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_0, 7); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0xfcfcfc00 | A6XX_VFD_CONTROL_1_REGID4VTX(3)); + tu_cs_emit(cs, 0x0000fcfc); + tu_cs_emit(cs, 0xfcfcfcfc); + tu_cs_emit(cs, 0x000000fc); + tu_cs_emit(cs, 0x0000fcfc); + tu_cs_emit(cs, 0x00000000); + + /* vs outputs: position in r0.xyzw, blit coords in r1.xy */ + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4); + tu_cs_emit(cs, blit ? 
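/* 0xffffffcf leaves components 4-5 enabled: the blit coords written to r1.xy at outloc 4 */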
0xffffffcf : 0xffffffff); + tu_cs_emit(cs, 0xffffffff); + tu_cs_emit(cs, 0xffffffff); + tu_cs_emit(cs, 0xffffffff); + + tu_cs_emit_regs(cs, A6XX_SP_VS_OUT_REG(0, + .a_regid = 0, .a_compmask = 0xf, + .b_regid = 4, .b_compmask = 0x3)); + tu_cs_emit_regs(cs, A6XX_SP_VS_VPC_DST_REG(0, .outloc0 = 0, .outloc1 = 4)); + + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1); + tu_cs_emit(cs, 0xff00ff00 | + COND(blit, A6XX_VPC_CNTL_0_VARYING) | + A6XX_VPC_CNTL_0_NUMNONPOSVAR(blit ? 8 : 0)); + + tu_cs_emit_regs(cs, A6XX_VPC_PACK( + .positionloc = 0, + .psizeloc = 0xff, + .stride_in_vpc = blit ? 6 : 4)); + tu_cs_emit_regs(cs, A6XX_SP_PRIMITIVE_CNTL(.vsout = blit ? 2 : 1)); + tu_cs_emit_regs(cs, + A6XX_PC_PRIMITIVE_CNTL_0(), + A6XX_PC_PRIMITIVE_CNTL_1(.stride_in_vpc = blit ? 6 : 4)); + + + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); + tu_cs_emit(cs, blit ? 0xe000 : 0); // I think this can just be 0 + for (uint32_t i = 1; i < 8; i++) + tu_cs_emit(cs, 0); + + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); + for (uint32_t i = 0; i < 8; i++) + tu_cs_emit(cs, 0x99999999); + + /* fs inputs: none, prefetch in blit case */ + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + blit); + tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(blit) | + A6XX_SP_FS_PREFETCH_CNTL_UNK4(0xfc) | + 0x7000); + if (blit) { + tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(4) | + A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(0) | + A6XX_SP_FS_PREFETCH_CMD_TEX_ID(0) | + A6XX_SP_FS_PREFETCH_CMD_DST(0) | + A6XX_SP_FS_PREFETCH_CMD_WRMASK(0xf) | + A6XX_SP_FS_PREFETCH_CMD_CMD(0x4)); + } + + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); + tu_cs_emit(cs, 0x3); // XXX blob uses 3 in blit path + tu_cs_emit(cs, 0xfcfcfcfc); + tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(blit ? 0 : 0xfc) | + A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(0xfc) | + 0xfc00fc00); + tu_cs_emit(cs, 0xfcfcfcfc); + tu_cs_emit(cs, 0xfcfc); + + tu_cs_emit_regs(cs, A6XX_HLSQ_UNKNOWN_B980(blit ? 
3 : 1)); + tu_cs_emit_regs(cs, A6XX_GRAS_CNTL(.varying = blit)); + tu_cs_emit_regs(cs, + A6XX_RB_RENDER_CONTROL0(.varying = blit, .unk10 = blit), + A6XX_RB_RENDER_CONTROL1()); + + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_CNTL()); + tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8101()); + tu_cs_emit_regs(cs, A6XX_GRAS_SAMPLE_CNTL()); + + /* shaders */ + struct ts_cs_memory shaders = { }; + VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, 16 * sizeof(instr_t), &shaders); + assert(result == VK_SUCCESS); + + memcpy(shaders.map, vs_code, sizeof(vs_code)); + + instr_t *fs = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET); + for (uint32_t i = 0; i < num_rts; i++) { + /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */ + fs[i] = (instr_t) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, + .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4 } }; + } + fs[num_rts] = (instr_t) { .cat0 = { .opc = OPC_END } }; + /* note: assumed <= 16 instructions (MAX_RTS is 8) */ + + tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff)); + tu_cs_emit_regs(cs, + A6XX_HLSQ_VS_CNTL(.constlen = 8, .enabled = true), + A6XX_HLSQ_HS_CNTL(), + A6XX_HLSQ_DS_CNTL(), + A6XX_HLSQ_GS_CNTL()); + tu_cs_emit_regs(cs, A6XX_HLSQ_FS_CNTL(.constlen = 4 * num_rts, .enabled = true)); + + tu_cs_emit_regs(cs, + A6XX_SP_VS_CONFIG(.enabled = true), + A6XX_SP_VS_INSTRLEN(1)); + tu_cs_emit_regs(cs, A6XX_SP_HS_CONFIG()); + tu_cs_emit_regs(cs, A6XX_SP_DS_CONFIG()); + tu_cs_emit_regs(cs, A6XX_SP_GS_CONFIG()); + tu_cs_emit_regs(cs, + A6XX_SP_FS_CONFIG(.enabled = true, .ntex = blit, .nsamp = blit), + A6XX_SP_FS_INSTRLEN(1)); + + tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0( + .threadsize = FOUR_QUADS, + .fullregfootprint = 2, + .mergedregs = true)); + tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0( + .varying = blit, + .threadsize = FOUR_QUADS, + /* could this be 0 in !blit && !num_rts case ? */ + .fullregfootprint = MAX2(1, num_rts), + .mergedregs = true)); /* note: tu_pipeline also sets 0x1000000 bit */ + + tu_cs_emit_regs(cs, A6XX_SP_IBO_COUNT(0)); + + tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit_qw(cs, shaders.iova); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OBJ_START_LO, 2); + tu_cs_emit_qw(cs, shaders.iova); + + tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OBJ_START_LO, 2); + tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET); + + tu_cs_emit_regs(cs, + A6XX_GRAS_CL_CNTL( + .persp_division_disable = 1, + .vp_xform_disable = 1, + .vp_clip_code_ignore = 1, + .clip_disable = 1), + A6XX_GRAS_UNKNOWN_8001(0)); + tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable? 
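+ /* GRAS_CL_CNTL above disables clipping and the viewport transform, so positions
+ * arrive in window space; the scissors below are opened to the full 0..0x7fff
+ * range and the RECTLIST coordinates alone bound the clear/blit rectangle
+ */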
+ + tu_cs_emit_regs(cs, + A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0), + A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff)); + tu_cs_emit_regs(cs, + A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0), + A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff)); +} + +static void +r3d_coords_raw(struct tu_cs *cs, const float *coords) +{ + tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(2)); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + tu_cs_emit_array(cs, (const uint32_t *) coords, 8); +} + +static void +r3d_coords(struct tu_cs *cs, + const VkOffset2D *dst, + const VkOffset2D *src, + const VkExtent2D *extent) +{ + int32_t src_x1 = src ? src->x : 0; + int32_t src_y1 = src ? src->y : 0; + r3d_coords_raw(cs, (float[]) { + dst->x, dst->y, + src_x1, src_y1, + dst->x + extent->width, dst->y + extent->height, + src_x1 + extent->width, src_y1 + extent->height, + }); +} + +static void +r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) +{ + tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + switch (format) { + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: { + /* cleared as r8g8b8a8_unorm using special format */ + uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24); + tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f)); + tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f)); + tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f)); + tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f)); + } break; + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D32_SFLOAT: + tu_cs_emit(cs, fui(val->depthStencil.depth)); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, 0); + break; + case VK_FORMAT_S8_UINT: + tu_cs_emit(cs, val->depthStencil.stencil & 0xff); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, 0); + break; + default: + /* as color formats use clear value as-is */ + assert(!vk_format_is_depth_or_stencil(format)); + tu_cs_emit_array(cs, val->color.uint32, 4); + break; + } +} + +static void +r3d_src_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t *tex_const, bool linear_filter) +{ + struct ts_cs_memory texture = { }; + VkResult result = tu_cs_alloc(&cmd->sub_cs, + 2, /* allocate space for a sampler too */ + A6XX_TEX_CONST_DWORDS, &texture); + assert(result == VK_SUCCESS); + + memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4); + + texture.map[A6XX_TEX_CONST_DWORDS + 0] = + A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) | + A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? 
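/* the min filter mirrors the mag filter; with UNNORM_COORDS set below, coordinates are in texel space */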
A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) | + A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) | + A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) | + A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) | + 0x60000; /* XXX used by blob, doesn't seem necessary */ + texture.map[A6XX_TEX_CONST_DWORDS + 1] = + 0x1 | /* XXX used by blob, doesn't seem necessary */ + A6XX_TEX_SAMP_1_UNNORM_COORDS | + A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR; + texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0; + texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0; + + tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2); + tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4); + + tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit_qw(cs, texture.iova); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2); + tu_cs_emit_qw(cs, texture.iova); + + tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1)); +} + +static void +r3d_src(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_image *image, + VkFormat format, + uint32_t level, + uint32_t layer, + bool linear_filter, + bool stencil_read) +{ + struct tu_image_view view; + + /* use tu_image_view_init to fill out a view descriptor */ + tu_image_view_init(&view, cmd->device, &(VkImageViewCreateInfo) { + .image = tu_image_to_handle(image), + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = format, + /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */ + .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = level, + .levelCount = 1, + .baseArrayLayer = layer, + .layerCount = 1, + }, + }); + r3d_src_common(cmd, cs, view.descriptor, linear_filter); +} + +static void +r3d_src_buffer(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat vk_format, + uint64_t va, uint32_t pitch, + uint32_t width, uint32_t height) +{ + uint32_t desc[A6XX_TEX_CONST_DWORDS]; + + struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR); + + desc[0] = + COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) | + A6XX_TEX_CONST_0_FMT(format.fmt) | + A6XX_TEX_CONST_0_SWAP(format.swap) | + A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) | + // XXX to swizzle into .w for stencil buffer_to_image + A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) | + A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) | + A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? 
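/* replicate .x into .yzw so the R8 stencil value also lands in .w, the channel the D24S8 destination's stencil mask (0x8) selects */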
A6XX_TEX_X : A6XX_TEX_W); + desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); + desc[2] = + A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) | + A6XX_TEX_CONST_2_PITCH(pitch) | + A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D); + desc[3] = 0; + desc[4] = va; + desc[5] = va >> 32; + for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++) + desc[i] = 0; + + r3d_src_common(cmd, cs, desc, false); +} + +static void +r3d_dst(struct tu_cs *cs, + struct tu_image *image, + VkFormat vk_format, + uint32_t level, + uint32_t layer) +{ + tu6_emit_msaa(cs, image->samples); /* TODO: move to setup */ + + struct tu_native_format format = tu6_format_image(image, vk_format, level); + + tu_cs_emit_regs(cs, + A6XX_RB_MRT_BUF_INFO(0, + .color_tile_mode = format.tile_mode, + .color_format = format.fmt, + .color_swap = format.swap), + A6XX_RB_MRT_PITCH(0, tu_image_pitch(image, level)), + A6XX_RB_MRT_ARRAY_PITCH(0, image->layout.layer_size), + A6XX_RB_MRT_BASE(0, tu_image_base_ref(image, level, layer)), + A6XX_RB_MRT_BASE_GMEM(0, 0)); + + tu_cs_emit_regs(cs, + A6XX_RB_MRT_FLAG_BUFFER_ADDR(0, tu_image_ubwc_base_ref(image, level, layer)), + A6XX_RB_MRT_FLAG_BUFFER_PITCH(0, .pitch = tu_image_ubwc_pitch(image, level))); + + tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = image->layout.ubwc_layer_size != 0)); +} + +static void +r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) +{ + struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR); + + tu6_emit_msaa(cs, 1); /* TODO: move to setup */ + + tu_cs_emit_regs(cs, + A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap), + A6XX_RB_MRT_PITCH(0, pitch), + A6XX_RB_MRT_ARRAY_PITCH(0, 0), + A6XX_RB_MRT_BASE_LO(0, (uint32_t) va), + A6XX_RB_MRT_BASE_HI(0, va >> 32), + A6XX_RB_MRT_BASE_GMEM(0, 0)); + + tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); +} + +static void +r3d_setup(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat vk_format, + enum a6xx_rotation rotation, + bool clear, + uint8_t mask) +{ + const struct tu_physical_device *phys_dev = cmd->device->physical_device; + + if (!cmd->state.pass) { + /* TODO: flushing with barriers instead of blindly always flushing */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false); + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + + tu_cs_emit_regs(cs, + A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass)); + + tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff); + } + tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000)); + tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000)); + + r3d_pipeline(cmd, cs, !clear, clear ? 
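/* a clear needs one RT's worth of FS constants for the clear color; a blit has none and samples the source through the prefetch instead */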
1 : 0); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); + tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | + A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) | + 0xfc000000); + tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1)); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1); + tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0)); + + tu_cs_emit_regs(cs, + A6XX_RB_FS_OUTPUT_CNTL0(), + A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1)); + + tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL()); + tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff)); + tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); + + tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL()); + tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL()); + tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL()); + tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL()); + tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK()); + tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK()); + tu_cs_emit_regs(cs, A6XX_RB_STENCILREF()); + + tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf)); + tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf)); + + tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0, + .color_format = tu6_base_format(vk_format), + .color_sint = vk_format_is_sint(vk_format), + .color_uint = vk_format_is_uint(vk_format))); + + tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask)); + tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format))); + tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format))); +} + +static void +r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); + tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) | + CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) | + CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY)); + tu_cs_emit(cs, 1); /* instance count */ + tu_cs_emit(cs, 2); /* vertex count */ + + if (!cmd->state.pass) { + /* TODO: flushing with barriers instead of blindly always flushing */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true); + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + } +} + +/* blit ops - common interface for 2d/shader paths */ + +struct blit_ops { + void (*coords)(struct tu_cs *cs, + const VkOffset2D *dst, + const VkOffset2D *src, + const VkExtent2D *extent); + void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val); + void (*src)( + struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_image *image, + VkFormat format, + uint32_t level, + uint32_t layer, + bool linear_filter, + bool stencil_read); + void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + VkFormat vk_format, + uint64_t va, uint32_t pitch, + uint32_t width, uint32_t height); + void (*dst)(struct tu_cs *cs, + struct tu_image *image, + VkFormat format, + uint32_t level, + uint32_t layer); + void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch); + void (*setup)(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat vk_format, + enum a6xx_rotation rotation, + bool clear, + uint8_t mask); + void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs); +}; + +static const struct blit_ops r2d_ops = { + .coords = r2d_coords, + .clear_value = r2d_clear_value, + .src = r2d_src, + .src_buffer = r2d_src_buffer, + .dst = r2d_dst, + .dst_buffer = r2d_dst_buffer, + .setup = r2d_setup, + .run = r2d_run, +}; + +static const struct blit_ops r3d_ops = { + .coords = r3d_coords, + .clear_value = r3d_clear_value, + .src = r3d_src, + 
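/* same contract as r2d_ops; used where the 2D engine can't go (msaa destination, stencil readout) */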
.src_buffer = r3d_src_buffer, + .dst = r3d_dst, + .dst_buffer = r3d_dst_buffer, + .setup = r3d_setup, + .run = r3d_run, +}; + +/* passthrough set coords from 3D extents */ +static void +coords(const struct blit_ops *ops, + struct tu_cs *cs, + const VkOffset3D *dst, + const VkOffset3D *src, + const VkExtent3D *extent) +{ + ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent); +} + +static void +tu6_blit_image(struct tu_cmd_buffer *cmd, + struct tu_image *src_image, + struct tu_image *dst_image, + const VkImageBlit *info, + VkFilter filter) +{ + const struct blit_ops *ops = &r2d_ops; + struct tu_cs *cs = &cmd->cs; + uint32_t layers; + + /* 2D blit can't do rotation mirroring from just coordinates */ + static const enum a6xx_rotation rotate[2][2] = { + {ROTATE_0, ROTATE_HFLIP}, + {ROTATE_VFLIP, ROTATE_180}, + }; + + bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) != + (info->dstOffsets[1].x < info->dstOffsets[0].x); + bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) != + (info->dstOffsets[1].y < info->dstOffsets[0].y); + bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) != + (info->dstOffsets[1].z < info->dstOffsets[0].z); + + if (mirror_z) { + tu_finishme("blit z mirror\n"); + return; + } + + if (info->srcOffsets[1].z - info->srcOffsets[0].z != + info->dstOffsets[1].z - info->dstOffsets[0].z) { + tu_finishme("blit z filter\n"); + return; + } + + layers = info->srcOffsets[1].z - info->srcOffsets[0].z; + if (info->dstSubresource.layerCount > 1) { + assert(layers <= 1); + layers = info->dstSubresource.layerCount; + } + + uint8_t mask = 0xf; + if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask); + if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT) + mask = 0x7; + if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) + mask = 0x8; + } + + if (dst_image->samples > 1) + ops = &r3d_ops; + + /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests, + * figure out why (should be able to pass all tests with only shader path) + */ + + ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask); + + if (ops == &r3d_ops) { + r3d_coords_raw(cs, (float[]) { + info->dstOffsets[0].x, info->dstOffsets[0].y, + info->srcOffsets[0].x, info->srcOffsets[0].y, + info->dstOffsets[1].x, info->dstOffsets[1].y, + info->srcOffsets[1].x, info->srcOffsets[1].y + }); + } else { + tu_cs_emit_regs(cs, + A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x), + .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)), + A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1, + .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1)); + tu_cs_emit_regs(cs, + A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)), + A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1), + A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)), + A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1)); + } + + for (uint32_t i = 0; i < layers; i++) { + ops->src(cmd, cs, src_image, src_image->vk_format, + info->srcSubresource.mipLevel, + info->srcSubresource.baseArrayLayer + info->srcOffsets[0].z + i, + filter == VK_FILTER_LINEAR, false); + ops->dst(cs, dst_image, dst_image->vk_format, + info->dstSubresource.mipLevel, + 
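/* for 3D images the z offset selects the slice; each iteration advances one slice/layer */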
info->dstSubresource.baseArrayLayer + info->dstOffsets[0].z + i); + ops->run(cmd, cs); + } +} + +void +tu_CmdBlitImage(VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkImageBlit *pRegions, + VkFilter filter) + +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_image, src_image, srcImage); + TU_FROM_HANDLE(tu_image, dst_image, dstImage); + + tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); + + for (uint32_t i = 0; i < regionCount; ++i) + tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter); +} + +static VkFormat +copy_format(VkFormat format) +{ + switch (vk_format_get_blocksizebits(format)) { + case 8: return VK_FORMAT_R8_UINT; + case 16: return VK_FORMAT_R16_UINT; + case 32: return VK_FORMAT_R32_UINT; + case 64: return VK_FORMAT_R32G32_UINT; + case 96: return VK_FORMAT_R32G32B32_UINT; + case 128:return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("unhandled format size"); + } +} + +static void +copy_compressed(VkFormat format, + VkOffset3D *offset, + VkExtent3D *extent, + uint32_t *pitch, + uint32_t *layer_size) +{ + if (!vk_format_is_compressed(format)) + return; + + uint32_t block_width = vk_format_get_blockwidth(format); + uint32_t block_height = vk_format_get_blockheight(format); + + offset->x /= block_width; + offset->y /= block_height; + + if (extent) { + extent->width = DIV_ROUND_UP(extent->width, block_width); + extent->height = DIV_ROUND_UP(extent->height, block_height); + } + if (pitch) + *pitch /= block_width; + if (layer_size) + *layer_size /= (block_width * block_height); +} + +static void +tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, + struct tu_buffer *src_buffer, + struct tu_image *dst_image, + const VkBufferImageCopy *info) +{ + struct tu_cs *cs = &cmd->cs; + uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); + VkFormat dst_format = dst_image->vk_format; + VkFormat src_format = dst_image->vk_format; + const struct blit_ops *ops = &r2d_ops; + uint8_t mask = 0xf; + + if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + switch (info->imageSubresource.aspectMask) { + case VK_IMAGE_ASPECT_STENCIL_BIT: + src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */ + mask = 0x8; + ops = &r3d_ops; + break; + case VK_IMAGE_ASPECT_DEPTH_BIT: + mask = 0x7; + break; + } + } + + VkOffset3D offset = info->imageOffset; + VkExtent3D extent = info->imageExtent; + uint32_t pitch = + (info->bufferRowLength ?: extent.width) * vk_format_get_blocksize(src_format); + uint32_t layer_size = (info->bufferImageHeight ?: extent.height) * pitch; + + if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) { + assert(src_format == dst_format); + copy_compressed(dst_format, &offset, &extent, &pitch, &layer_size); + src_format = dst_format = copy_format(dst_format); + } + + /* note: the src_va/pitch alignment of 64 is for 2D engine, + * it is also valid for 1cpp format with shader path (stencil aspect path) + */ + + ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask); + + for (uint32_t i = 0; i < layers; i++) { + ops->dst(cs, dst_image, dst_format, + info->imageSubresource.mipLevel, + info->imageSubresource.baseArrayLayer + info->imageOffset.z + i); + + uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i; + if ((src_va & 63) || 
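/* unaligned case: the 2D engine needs a 64-byte aligned base and pitch (see note above), so copy one line at a time with the base rounded down and the x offset compensating */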
(pitch & 63)) { + for (uint32_t y = 0; y < extent.height; y++) { + uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format); + ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch, + x + extent.width, 1); + ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x}, + &(VkExtent2D) {extent.width, 1}); + ops->run(cmd, cs); + src_va += pitch; + } + } else { + ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height); + coords(ops, cs, &offset, &(VkOffset3D){}, &extent); + ops->run(cmd, cs); + } + } +} + +void +tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkBufferImageCopy *pRegions) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_image, dst_image, dstImage); + TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer); + + tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); + + for (unsigned i = 0; i < regionCount; ++i) + tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i); +} + +static void +tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, + struct tu_image *src_image, + struct tu_buffer *dst_buffer, + const VkBufferImageCopy *info) +{ + struct tu_cs *cs = &cmd->cs; + uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); + VkFormat src_format = src_image->vk_format; + VkFormat dst_format = src_image->vk_format; + bool stencil_read = false; + + if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && + info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { + dst_format = VK_FORMAT_R8_UNORM; + stencil_read = true; + } + + const struct blit_ops *ops = stencil_read ? 
&r3d_ops : &r2d_ops;
+   VkOffset3D offset = info->imageOffset;
+   VkExtent3D extent = info->imageExtent;
+   uint32_t pitch = (info->bufferRowLength ?: extent.width) * vk_format_get_blocksize(dst_format);
+   uint32_t layer_size = (info->bufferImageHeight ?: extent.height) * pitch;
+
+   if (src_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
+      assert(src_format == dst_format);
+      copy_compressed(dst_format, &offset, &extent, &pitch, &layer_size);
+      src_format = dst_format = copy_format(dst_format);
+   }
+
+   /* note: the dst_va/pitch alignment of 64 is for the 2D engine,
+    * it is also valid for 1cpp formats with the shader path (stencil aspect)
+    */
+
+   ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
+
+   for (uint32_t i = 0; i < layers; i++) {
+      ops->src(cmd, cs, src_image, src_format,
+               info->imageSubresource.mipLevel,
+               info->imageSubresource.baseArrayLayer + info->imageOffset.z + i,
+               false, stencil_read);
+
+      uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
+      if ((dst_va & 63) || (pitch & 63)) {
+         for (uint32_t y = 0; y < extent.height; y++) {
+            uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
+            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
+            ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
+                        &(VkExtent2D) {extent.width, 1});
+            ops->run(cmd, cs);
+            dst_va += pitch;
+         }
+      } else {
+         ops->dst_buffer(cs, dst_format, dst_va, pitch);
+         coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
+         ops->run(cmd, cs);
+      }
+   }
+}
+
+void
+tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
+                        VkImage srcImage,
+                        VkImageLayout srcImageLayout,
+                        VkBuffer dstBuffer,
+                        uint32_t regionCount,
+                        const VkBufferImageCopy *pRegions)
+{
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+   TU_FROM_HANDLE(tu_image, src_image, srcImage);
+   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
+
+   tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
+   tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
+
+   for (unsigned i = 0; i < regionCount; ++i)
+      tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
+}
+
+static void
+tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
+                       struct tu_image *src_image,
+                       struct tu_image *dst_image,
+                       const VkImageCopy *info)
+{
+   const struct blit_ops *ops = &r2d_ops;
+   struct tu_cs *cs = &cmd->cs;
+
+   uint8_t mask = 0xf;
+   if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
+      if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
+         mask = 0x7;
+      if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
+         mask = 0x8;
+   }
+
+   if (dst_image->samples > 1)
+      ops = &r3d_ops;
+
+   assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
+
+   VkFormat format = VK_FORMAT_UNDEFINED;
+   VkOffset3D src_offset = info->srcOffset;
+   VkOffset3D dst_offset = info->dstOffset;
+   VkExtent3D extent = info->extent;
+
+   /* TODO: should check (ubwc || (tile_mode && swap)) instead */
+   if (src_image->layout.tile_mode && src_image->vk_format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
+      format = src_image->vk_format;
+
+   if (dst_image->layout.tile_mode && dst_image->vk_format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
+      if (format != VK_FORMAT_UNDEFINED && format != dst_image->vk_format) {
+         /* we can be clever in some cases, but in others we need an
+          * intermediate linear buffer
+          */
+         tu_finishme("image copy between two tiled/ubwc images\n");
+         return;
+      }
+      format = dst_image->vk_format;
+   }
+
+   if (format ==
VK_FORMAT_UNDEFINED) + format = copy_format(src_image->vk_format); + + copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL); + copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL); + + ops->setup(cmd, cs, format, ROTATE_0, false, mask); + coords(ops, cs, &dst_offset, &src_offset, &extent); + + for (uint32_t i = 0; i < info->extent.depth; i++) { + ops->src(cmd, cs, src_image, format, + info->srcSubresource.mipLevel, + info->srcSubresource.baseArrayLayer + info->srcOffset.z + i, + false, false); + ops->dst(cs, dst_image, format, + info->dstSubresource.mipLevel, + info->dstSubresource.baseArrayLayer + info->dstOffset.z + i); + ops->run(cmd, cs); + } +} + +void +tu_CmdCopyImage(VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage destImage, + VkImageLayout destImageLayout, + uint32_t regionCount, + const VkImageCopy *pRegions) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_image, src_image, srcImage); + TU_FROM_HANDLE(tu_image, dst_image, destImage); + + tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); + + for (uint32_t i = 0; i < regionCount; ++i) + tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i); +} + +static void +copy_buffer(struct tu_cmd_buffer *cmd, + uint64_t dst_va, + uint64_t src_va, + uint64_t size, + uint32_t block_size) +{ + const struct blit_ops *ops = &r2d_ops; + struct tu_cs *cs = &cmd->cs; + VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM; + uint64_t blocks = size / block_size; + + ops->setup(cmd, cs, format, ROTATE_0, false, 0xf); + + while (blocks) { + uint32_t src_x = (src_va & 63) / block_size; + uint32_t dst_x = (dst_va & 63) / block_size; + uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x); + + ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1); + ops->dst_buffer( cs, format, dst_va & ~63, 0); + ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1}); + ops->run(cmd, cs); + + src_va += width * block_size; + dst_va += width * block_size; + blocks -= width; + } +} + +void +tu_CmdCopyBuffer(VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkBuffer dstBuffer, + uint32_t regionCount, + const VkBufferCopy *pRegions) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer); + TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer); + + tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE); + + for (unsigned i = 0; i < regionCount; ++i) { + copy_buffer(cmd, + tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset, + tu_buffer_iova(src_buffer) + pRegions[i].srcOffset, + pRegions[i].size, 1); + } +} + +void +tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize dataSize, + const void *pData) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); + + tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); + + struct ts_cs_memory tmp; + VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + memcpy(tmp.map, pData, dataSize); + copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4); +} + +void +tu_CmdFillBuffer(VkCommandBuffer 
commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize fillSize, + uint32_t data) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); + const struct blit_ops *ops = &r2d_ops; + struct tu_cs *cs = &cmd->cs; + + tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); + + if (fillSize == VK_WHOLE_SIZE) + fillSize = buffer->size - dstOffset; + + uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset; + uint32_t blocks = fillSize / 4; + + ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf); + ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}}); + + while (blocks) { + uint32_t dst_x = (dst_va & 63) / 4; + uint32_t width = MIN2(blocks, 0x4000 - dst_x); + + ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0); + ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1}); + ops->run(cmd, cs); + + dst_va += width * 4; + blocks -= width; + } +} + +void +tu_CmdResolveImage(VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkImageResolve *pRegions) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_image, src_image, srcImage); + TU_FROM_HANDLE(tu_image, dst_image, dstImage); + const struct blit_ops *ops = &r2d_ops; + struct tu_cs *cs = &cmd->cs; + + tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); + + ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf); + + for (uint32_t i = 0; i < regionCount; ++i) { + const VkImageResolve *info = &pRegions[i]; + uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount); + + assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount); + /* TODO: aspect masks possible ? 
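(the spec seems to allow only VK_IMAGE_ASPECT_COLOR_BIT for vkCmdResolveImage, so assuming color-only here is likely safe) 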
*/ + + coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent); + + for (uint32_t i = 0; i < layers; i++) { + ops->src(cmd, cs, src_image, src_image->vk_format, + info->srcSubresource.mipLevel, + info->srcSubresource.baseArrayLayer + info->srcOffset.z + i, + false, false); + ops->dst(cs, dst_image, dst_image->vk_format, + info->dstSubresource.mipLevel, + info->dstSubresource.baseArrayLayer + info->dstOffset.z + i); + ops->run(cmd, cs); + } + } +} + +void +tu_resolve_sysmem(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_image_view *src, + struct tu_image_view *dst, + uint32_t layers, + const VkRect2D *rect) +{ + const struct blit_ops *ops = &r2d_ops; + + tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE); + + assert(src->vk_format == dst->vk_format); + + ops->setup(cmd, cs, dst->vk_format, ROTATE_0, false, 0xf); + ops->coords(cs, &rect->offset, &rect->offset, &rect->extent); + + for (uint32_t i = 0; i < layers; i++) { + ops->src(cmd, cs, src->image, src->vk_format, + src->base_mip, + src->base_layer + i, + false, false); + ops->dst(cs, dst->image, dst->vk_format, + dst->base_mip, + dst->base_layer + i); + ops->run(cmd, cs); + } +} + +static void +clear_image(struct tu_cmd_buffer *cmd, + struct tu_image *image, + const VkClearValue *clear_value, + const VkImageSubresourceRange *range) +{ + uint32_t level_count = tu_get_levelCount(image, range); + uint32_t layer_count = tu_get_layerCount(image, range); + struct tu_cs *cs = &cmd->cs; + VkFormat format = image->vk_format; + if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + format = VK_FORMAT_R32_UINT; + + if (image->type == VK_IMAGE_TYPE_3D) { + assert(layer_count == 1); + assert(range->baseArrayLayer == 0); + } + + uint8_t mask = 0xf; + if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + mask = 0; + if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) + mask |= 0x7; + if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) + mask |= 0x8; + } + + const struct blit_ops *ops = image->samples > 1 ? 
&r3d_ops : &r2d_ops; + + ops->setup(cmd, cs, format, ROTATE_0, true, mask); + ops->clear_value(cs, image->vk_format, clear_value); + + for (unsigned j = 0; j < level_count; j++) { + if (image->type == VK_IMAGE_TYPE_3D) + layer_count = u_minify(image->extent.depth, range->baseMipLevel + j); + + ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) { + u_minify(image->extent.width, range->baseMipLevel + j), + u_minify(image->extent.height, range->baseMipLevel + j) + }); + + for (uint32_t i = 0; i < layer_count; i++) { + ops->dst(cs, image, format, range->baseMipLevel + j, range->baseArrayLayer + i); + ops->run(cmd, cs); + } + } +} + +void +tu_CmdClearColorImage(VkCommandBuffer commandBuffer, + VkImage image_h, + VkImageLayout imageLayout, + const VkClearColorValue *pColor, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_image, image, image_h); + + tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); + + for (unsigned i = 0; i < rangeCount; i++) + clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i); +} + +void +tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, + VkImage image_h, + VkImageLayout imageLayout, + const VkClearDepthStencilValue *pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_image, image, image_h); + + tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); + + for (unsigned i = 0; i < rangeCount; i++) + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i); +} + +static void +tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd, + uint32_t attachment_count, + const VkClearAttachment *attachments, + uint32_t rect_count, + const VkClearRect *rects) +{ + const struct tu_subpass *subpass = cmd->state.subpass; + /* note: cannot use shader path here.. 
there is a special shader path
+    * in tu_clear_sysmem_attachments()
+    */
+   const struct blit_ops *ops = &r2d_ops;
+   struct tu_cs *cs = &cmd->draw_cs;
+
+   for (uint32_t j = 0; j < attachment_count; j++) {
+      uint32_t a;
+      uint8_t mask = 0xf;
+
+      if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
+         a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
+      } else {
+         a = subpass->depth_stencil_attachment.attachment;
+
+         /* sync depth into color */
+         tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
+         /* also flush color to avoid losing contents from invalidate */
+         tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
+         tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
+
+         if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
+            mask &= ~0x7;
+         if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
+            mask &= ~0x8;
+      }
+
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      const struct tu_image_view *iview =
+         cmd->state.framebuffer->attachments[a].attachment;
+
+      ops->setup(cmd, cs, iview->vk_format, ROTATE_0, true, mask);
+      ops->clear_value(cs, iview->vk_format, &attachments[j].clearValue);
+
+      for (uint32_t i = 0; i < rect_count; i++) {
+         ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
+         for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
+            ops->dst(cs, iview->image, iview->vk_format, iview->base_mip,
+                     iview->base_layer + rects[i].baseArrayLayer + layer);
+            ops->run(cmd, cs);
+         }
+      }
+
+      if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
+         /* does not use CCU - flush
+          * note: cache invalidate might be needed too, and just not covered by test cases
+          */
+         if (attachments[j].colorAttachment > 0)
+            tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
+      } else {
+         /* sync color into depth */
+         tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
+         tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
+      }
+   }
+}
+
+static void
+tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
+                            uint32_t attachment_count,
+                            const VkClearAttachment *attachments,
+                            uint32_t rect_count,
+                            const VkClearRect *rects)
+{
+   /* the shader path here is special: it avoids changing MRT/etc state */
+   const struct tu_render_pass *pass = cmd->state.pass;
+   const struct tu_subpass *subpass = cmd->state.subpass;
+   const uint32_t mrt_count = subpass->color_count;
+   struct tu_cs *cs = &cmd->draw_cs;
+   uint32_t clear_value[MAX_RTS][4];
+   float z_clear_val = 0.0f;
+   uint8_t s_clear_val = 0;
+   uint32_t clear_rts = 0, num_rts = 0, b;
+   bool z_clear = false;
+   bool s_clear = false;
+   uint32_t max_samples = 1;
+
+   for (uint32_t i = 0; i < attachment_count; i++) {
+      uint32_t a;
+      if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
+         uint32_t c = attachments[i].colorAttachment;
+         a = subpass->color_attachments[c].attachment;
+         if (a == VK_ATTACHMENT_UNUSED)
+            continue;
+
+         clear_rts |= 1 << c;
+         memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
+      } else {
+         a = subpass->depth_stencil_attachment.attachment;
+         if (a == VK_ATTACHMENT_UNUSED)
+            continue;
+
+         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
+            z_clear = true;
+            z_clear_val = attachments[i].clearValue.depthStencil.depth;
+         }
+
+         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
+            s_clear = true;
+            s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
+         }
+      }
+
+      max_samples = MAX2(max_samples, pass->attachments[a].samples);
+   }
+
+   /* prefer to use 2D path for
clears + * 2D can't clear separate depth/stencil and msaa, needs known framebuffer + */ + if (max_samples == 1 && cmd->state.framebuffer) { + tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects); + return; + } + + /* TODO: this path doesn't take into account multilayer rendering */ + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); + tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | + A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) | + 0xfc000000); + tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count)); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count); + for (uint32_t i = 0; i < mrt_count; i++) { + if (clear_rts & (1 << i)) + tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4)); + else + tu_cs_emit(cs, 0); + } + + r3d_pipeline(cmd, cs, false, num_rts); + + tu_cs_emit_regs(cs, + A6XX_RB_FS_OUTPUT_CNTL0(), + A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count)); + + tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL()); + tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff)); + tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); + for (uint32_t i = 0; i < mrt_count; i++) { + tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i, + .component_enable = COND(clear_rts & (1 << i), 0xf))); + } + + tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL()); + tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL( + .z_enable = z_clear, + .z_write_enable = z_clear, + .zfunc = FUNC_ALWAYS)); + tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL()); + tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL( + .stencil_enable = s_clear, + .func = FUNC_ALWAYS, + .zpass = VK_STENCIL_OP_REPLACE)); + tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff)); + tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff)); + tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val)); + + tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(num_rts)); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + for_each_bit(b, clear_rts) + tu_cs_emit_array(cs, clear_value[b], 4); + + for (uint32_t i = 0; i < rect_count; i++) { + r3d_coords_raw(cs, (float[]) { + rects[i].rect.offset.x, rects[i].rect.offset.y, + z_clear_val, 1.0f, + rects[i].rect.offset.x + rects[i].rect.extent.width, + rects[i].rect.offset.y + rects[i].rect.extent.height, + z_clear_val, 1.0f + }); + r3d_run(cmd, cs); + } + + cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE | + TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK | + TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | + TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | + TU_CMD_DIRTY_DYNAMIC_VIEWPORT | + TU_CMD_DIRTY_DYNAMIC_SCISSOR; +} + +/** + * Pack a VkClearValue into a 128-bit buffer. format is respected except + * for the component order. 
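The packed value is what
+ * tu_emit_clear_gmem_attachment() below writes to RB_BLIT_CLEAR_COLOR_DW0..3.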
The components are always packed in WZYX order, + * because gmem is tiled and tiled formats always have WZYX swap + */ +static void +pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4]) +{ + const struct util_format_description *desc = vk_format_description(format); + + switch (format) { + case VK_FORMAT_B10G11R11_UFLOAT_PACK32: + buf[0] = float3_to_r11g11b10f(val->color.float32); + return; + case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: + buf[0] = float3_to_rgb9e5(val->color.float32); + return; + default: + break; + } + + assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); + + /* S8_UINT is special and has no depth */ + const int max_components = + format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels; + + int buf_offset = 0; + int bit_shift = 0; + for (int comp = 0; comp < max_components; comp++) { + const struct util_format_channel_description *ch = + tu_get_format_channel_description(desc, comp); + if (!ch) { + assert((format == VK_FORMAT_S8_UINT && comp == 0) || + (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1)); + continue; + } + + union tu_clear_component_value v = tu_get_clear_component_value( + val, comp, desc->colorspace); + + /* move to the next uint32_t when there is not enough space */ + assert(ch->size <= 32); + if (bit_shift + ch->size > 32) { + buf_offset++; + bit_shift = 0; + } + + if (bit_shift == 0) + buf[buf_offset] = 0; + + buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift; + bit_shift += ch->size; + } +} + +static void +tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t attachment, + uint8_t component_mask, + const VkClearValue *value) +{ + VkFormat vk_format = cmd->state.pass->attachments[attachment].format; + /* note: component_mask is 0x7 for depth and 0x8 for stencil + * because D24S8 is cleared with AS_R8G8B8A8 format + */ + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format))); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); + tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask)); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); + tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); + tu_cs_emit(cs, 0); + + uint32_t clear_vals[4] = {}; + pack_gmem_clear_value(value, vk_format, clear_vals); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); + tu_cs_emit_array(cs, clear_vals, 4); + + tu6_emit_event_write(cmd, cs, BLIT, false); +} + +static void +tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, + uint32_t attachment_count, + const VkClearAttachment *attachments, + uint32_t rect_count, + const VkClearRect *rects) +{ + const struct tu_subpass *subpass = cmd->state.subpass; + struct tu_cs *cs = &cmd->draw_cs; + + /* TODO: swap the loops for smaller cmdstream */ + for (unsigned i = 0; i < rect_count; i++) { + unsigned x1 = rects[i].rect.offset.x; + unsigned y1 = rects[i].rect.offset.y; + unsigned x2 = x1 + rects[i].rect.extent.width - 1; + unsigned y2 = y1 + rects[i].rect.extent.height - 1; + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2); + tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1)); + tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2)); + + for (unsigned j = 0; j < attachment_count; j++) { + uint32_t a; + unsigned clear_mask = 0; + if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { + clear_mask = 
0xf; + a = subpass->color_attachments[attachments[j].colorAttachment].attachment; + } else { + a = subpass->depth_stencil_attachment.attachment; + if (attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) + clear_mask |= 0x7; + if (attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) + clear_mask |= 0x8; + } + + if (a == VK_ATTACHMENT_UNUSED) + continue; + + tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask, + &attachments[j].clearValue); + } + } +} + +void +tu_CmdClearAttachments(VkCommandBuffer commandBuffer, + uint32_t attachmentCount, + const VkClearAttachment *pAttachments, + uint32_t rectCount, + const VkClearRect *pRects) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + struct tu_cs *cs = &cmd->draw_cs; + + tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); + tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); + tu_cond_exec_end(cs); + + tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); + tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); + tu_cond_exec_end(cs); +} + +void +tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + const VkRenderPassBeginInfo *info) +{ + const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_render_pass_attachment *attachment = + &cmd->state.pass->attachments[a]; + uint8_t mask = 0; + + if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) + mask = 0xf; + + if (iview->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + mask &= 0x7; + if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) + mask |= 0x8; + } + + /* gmem_offset<0 means it isn't used by any subpass and shouldn't be cleared */ + if (attachment->gmem_offset < 0 || !mask) + return; + + const struct blit_ops *ops = &r2d_ops; + if (attachment->samples > 1) + ops = &r3d_ops; + + ops->setup(cmd, cs, iview->vk_format, ROTATE_0, true, mask); + ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent); + ops->clear_value(cs, iview->vk_format, &info->pClearValues[a]); + + for (uint32_t i = 0; i < fb->layers; i++) { + ops->dst(cs, iview->image, iview->vk_format, iview->base_mip, iview->base_layer + i); + ops->run(cmd, cs); + } +} + +void +tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + const VkRenderPassBeginInfo *info) +{ + const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_render_pass_attachment *attachment = + &cmd->state.pass->attachments[a]; + unsigned clear_mask = 0; + + /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */ + if (attachment->gmem_offset < 0) + return; + + if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) + clear_mask = 0xf; + + if (vk_format_has_stencil(iview->vk_format)) { + clear_mask &= 0x7; + if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) + clear_mask |= 0x8; + } + if (!clear_mask) + return; + + tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples))); + + tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask, + &info->pClearValues[a]); +} + +static void +tu_emit_blit(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_image_view *iview, + struct tu_render_pass_attachment *attachment, + bool resolve) +{ + const struct tu_native_format format = + tu6_format_image(iview->image, iview->vk_format, iview->base_mip); + + 
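/* note: this path uses the BLIT event rather than the 2D engine; with
+    * resolve=false it loads an attachment into gmem, with resolve=true it
+    * stores/resolves gmem back to memory (see tu_load/store_gmem_attachment)
+    */
+   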
tu_cs_emit_regs(cs, + A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples))); + + tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO( + .unk0 = !resolve, + .gmem = !resolve, + /* "integer" bit disables msaa resolve averaging */ + .integer = vk_format_is_int(iview->vk_format))); + + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_DST_INFO( + .tile_mode = format.tile_mode, + .samples = tu_msaa_samples(iview->image->samples), + .color_format = format.fmt, + .color_swap = format.swap, + .flags = iview->image->layout.ubwc_layer_size != 0), + A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)), + A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)), + A6XX_RB_BLIT_DST_ARRAY_PITCH(iview->image->layout.layer_size)); + + if (iview->image->layout.ubwc_layer_size) { + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_FLAG_DST(tu_image_view_ubwc_base_ref(iview)), + A6XX_RB_BLIT_FLAG_DST_PITCH(tu_image_view_ubwc_pitches(iview))); + } + + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset)); + + tu6_emit_event_write(cmd, cs, BLIT, false); +} + +static bool +blit_can_resolve(VkFormat format) +{ + const struct util_format_description *desc = vk_format_description(format); + + /* blit event can only do resolve for simple cases: + * averaging samples as unsigned integers or choosing only one sample + */ + if (vk_format_is_snorm(format) || vk_format_is_srgb(format)) + return false; + + /* can't do formats with larger channel sizes + * note: this includes all float formats + * note2: single channel integer formats seem OK + */ + if (desc->channel[0].size > 10) + return false; + + switch (format) { + /* for unknown reasons blit event can't msaa resolve these formats when tiled + * likely related to these formats having different layout from other cpp=2 formats + */ + case VK_FORMAT_R8G8_UNORM: + case VK_FORMAT_R8G8_UINT: + case VK_FORMAT_R8G8_SINT: + /* TODO: this one should be able to work? 
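+    *       (it is cpp=4, unlike the cpp=2 formats above, so the layout issue
+    *        mentioned there should not apply)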
*/
+   case VK_FORMAT_D24_UNORM_S8_UINT:
+      return false;
+   default:
+      break;
+   }
+
+   return true;
+}
+
+void
+tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
+{
+   tu_emit_blit(cmd, cs,
+                cmd->state.framebuffer->attachments[a].attachment,
+                &cmd->state.pass->attachments[a],
+                false);
+}
+
+void
+tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
+{
+   const struct tu_render_pass_attachment *attachment =
+      &cmd->state.pass->attachments[a];
+
+   if (attachment->gmem_offset < 0)
+      return;
+
+   if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD ||
+       (vk_format_has_stencil(attachment->format) &&
+        attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD)) {
+      tu_emit_load_gmem_attachment(cmd, cs, a);
+   }
+}
+
+void
+tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
+                         struct tu_cs *cs,
+                         uint32_t a,
+                         uint32_t gmem_a)
+{
+   const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
+   const VkRect2D *render_area = &tiling->render_area;
+   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
+   struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
+   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
+
+   if (dst->store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
+      return;
+
+   uint32_t x1 = render_area->offset.x;
+   uint32_t y1 = render_area->offset.y;
+   uint32_t x2 = x1 + render_area->extent.width;
+   uint32_t y2 = y1 + render_area->extent.height;
+   /* x2/y2 can be unaligned if equal to the size of the image, since the
+    * store will write into padding space. The one exception is linear
+    * levels, which don't have the required y padding in the layout
+    * (except for the last level).
+    */
+   bool need_y2_align =
+      y2 != iview->extent.height ||
+      (tu6_get_image_tile_mode(iview->image, iview->base_mip) == TILE6_LINEAR &&
+       iview->base_mip != iview->image->level_count - 1);
+
+   bool unaligned =
+      x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
+      y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
+
+   /* use fast path when render area is aligned, except for unsupported resolve cases */
+   if (!unaligned && (a == gmem_a || blit_can_resolve(iview->vk_format))) {
+      tu_emit_blit(cmd, cs, iview, src, true);
+      return;
+   }
+
+   if (dst->samples > 1) {
+      /* I guess we need to use shader path in this case?
+ * need a testcase which fails because of this + */ + tu_finishme("unaligned store of msaa attachment\n"); + return; + } + + r2d_setup_common(cmd, cs, iview->vk_format, ROTATE_0, false, 0xf, true); + r2d_dst(cs, iview->image, iview->vk_format, iview->base_mip, iview->base_layer); + r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); + + tu_cs_emit_regs(cs, + A6XX_SP_PS_2D_SRC_INFO( + .color_format = tu6_format_texture(src->format, TILE6_2).fmt, + .tile_mode = TILE6_2, + .srgb = vk_format_is_srgb(src->format), + .samples = tu_msaa_samples(src->samples), + .samples_average = !vk_format_is_int(src->format), + .unk20 = 1, + .unk22 = 1), + /* note: src size does not matter when not scaling */ + A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff), + A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset), + A6XX_SP_PS_2D_SRC_HI(), + A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp)); + + /* sync GMEM writes with CACHE */ + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + + tu_cs_emit_pkt7(cs, CP_BLIT, 1); + tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + /* TODO: flushing with barriers instead of blindly always flushing */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true); + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); +} diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 487db2a89ff..0583be32ecd 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -33,7 +33,6 @@ #include "vk_format.h" #include "tu_cs.h" -#include "tu_blit.h" #define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0) @@ -111,69 +110,6 @@ tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other) return VK_SUCCESS; } -static bool -is_linear_mipmapped(const struct tu_image_view *iview) -{ - return iview->image->layout.tile_mode == TILE6_LINEAR && - iview->base_mip != iview->image->level_count - 1; -} - -static bool -force_sysmem(const struct tu_cmd_buffer *cmd, - const struct VkRect2D *render_area) -{ - const struct tu_framebuffer *fb = cmd->state.framebuffer; - bool has_linear_mipmapped_store = false; - const struct tu_render_pass *pass = cmd->state.pass; - - /* Layered rendering requires sysmem. 
*/ - if (fb->layers > 1) - return true; - - /* Iterate over all the places we call tu6_emit_store_attachment() */ - for (unsigned i = 0; i < pass->subpass_count; i++) { - const struct tu_subpass *subpass = &pass->subpasses[i]; - if (subpass->resolve_attachments) { - for (unsigned i = 0; i < subpass->color_count; i++) { - uint32_t a = subpass->resolve_attachments[i].attachment; - if (a != VK_ATTACHMENT_UNUSED && - cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_STORE) { - const struct tu_image_view *iview = fb->attachments[a].attachment; - if (is_linear_mipmapped(iview)) { - has_linear_mipmapped_store = true; - break; - } - } - } - } - } - - for (unsigned i = 0; i < pass->attachment_count; i++) { - if (pass->attachments[i].gmem_offset >= 0 && - cmd->state.pass->attachments[i].store_op == VK_ATTACHMENT_STORE_OP_STORE) { - const struct tu_image_view *iview = fb->attachments[i].attachment; - if (is_linear_mipmapped(iview)) { - has_linear_mipmapped_store = true; - break; - } - } - } - - /* Linear textures cannot have any padding between mipmap levels and their - * height isn't padded, while at the same time the GMEM->MEM resolve does - * not have per-pixel granularity, so if the image height isn't aligned to - * the resolve granularity and the render area is tall enough, we may wind - * up writing past the bottom of the image into the next miplevel or even - * past the end of the image. For the last miplevel, the layout code should - * insert enough padding so that the overdraw writes to the padding. To - * work around this, we force-enable sysmem rendering. - */ - const uint32_t y2 = render_area->offset.y + render_area->extent.height; - const uint32_t aligned_y2 = ALIGN_POT(y2, GMEM_ALIGN_H); - - return has_linear_mipmapped_store && aligned_y2 > fb->height; -} - static void tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling, const struct tu_device *dev, @@ -421,10 +357,6 @@ tu6_emit_wfi(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } } -#define tu_image_view_ubwc_pitches(iview) \ - .pitch = tu_image_ubwc_pitch(iview->image, iview->base_mip), \ - .array_pitch = tu_image_ubwc_size(iview->image, iview->base_mip) >> 2 - static void tu6_emit_zs(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass, @@ -497,20 +429,18 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd, continue; const struct tu_image_view *iview = fb->attachments[a].attachment; - const enum a6xx_tile_mode tile_mode = - tu6_get_image_tile_mode(iview->image, iview->base_mip); mrt_comp[i] = 0xf; if (vk_format_is_srgb(iview->vk_format)) srgb_cntl |= (1 << i); - const struct tu_native_format format = - tu6_format_color(iview->vk_format, iview->image->layout.tile_mode); + struct tu_native_format format = + tu6_format_image(iview->image, iview->vk_format, iview->base_mip); tu_cs_emit_regs(cs, A6XX_RB_MRT_BUF_INFO(i, - .color_tile_mode = tile_mode, + .color_tile_mode = format.tile_mode, .color_format = format.fmt, .color_swap = format.swap), A6XX_RB_MRT_PITCH(i, tu_image_stride(iview->image, iview->base_mip)), @@ -563,12 +493,10 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd, .type = LAYER_2D_ARRAY)); } -static void -tu6_emit_msaa(struct tu_cmd_buffer *cmd, - const struct tu_subpass *subpass, - struct tu_cs *cs) +void +tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples) { - const enum a3xx_msaa_samples samples = tu_msaa_samples(subpass->samples); + const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples); bool msaa_disable = samples == MSAA_ONE; tu_cs_emit_regs(cs, @@ -681,51 +609,8 @@ 
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align) A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2)); } -static void -tu6_emit_blit_info(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - const struct tu_image_view *iview, - uint32_t gmem_offset, - bool resolve) -{ - tu_cs_emit_regs(cs, - A6XX_RB_BLIT_INFO(.unk0 = !resolve, .gmem = !resolve)); - - const struct tu_native_format format = - tu6_format_color(iview->vk_format, iview->image->layout.tile_mode); - - enum a6xx_tile_mode tile_mode = - tu6_get_image_tile_mode(iview->image, iview->base_mip); - tu_cs_emit_regs(cs, - A6XX_RB_BLIT_DST_INFO( - .tile_mode = tile_mode, - .samples = tu_msaa_samples(iview->image->samples), - .color_format = format.fmt, - .color_swap = format.swap, - .flags = iview->image->layout.ubwc_layer_size != 0), - A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)), - A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)), - A6XX_RB_BLIT_DST_ARRAY_PITCH(iview->image->layout.layer_size)); - - if (iview->image->layout.ubwc_layer_size) { - tu_cs_emit_regs(cs, - A6XX_RB_BLIT_FLAG_DST(tu_image_view_ubwc_base_ref(iview)), - A6XX_RB_BLIT_FLAG_DST_PITCH(tu_image_view_ubwc_pitches(iview))); - } - - tu_cs_emit_regs(cs, - A6XX_RB_BLIT_BASE_GMEM(gmem_offset)); -} - -static void -tu6_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs) -{ - tu6_emit_event_write(cmd, cs, BLIT, false); -} - -static void -tu6_emit_window_scissor(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, +void +tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, @@ -740,11 +625,8 @@ tu6_emit_window_scissor(struct tu_cmd_buffer *cmd, A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2)); } -static void -tu6_emit_window_offset(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t x1, - uint32_t y1) +void +tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1) { tu_cs_emit_regs(cs, A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1)); @@ -783,6 +665,9 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd) if (!cmd->state.pass->gmem_pixels) return true; + if (cmd->state.framebuffer->layers > 1) + return true; + return cmd->state.tiling_config.force_sysmem; } @@ -801,8 +686,8 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, const uint32_t y1 = tile->begin.y; const uint32_t x2 = tile->end.x - 1; const uint32_t y2 = tile->end.y - 1; - tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2); - tu6_emit_window_offset(cmd, cs, x1, y1); + tu6_emit_window_scissor(cs, x1, y1, x2, y2); + tu6_emit_window_offset(cs, x1, y1); tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable = false)); @@ -861,93 +746,6 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, } } -static void -tu6_emit_load_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a) -{ - const struct tu_tiling_config *tiling = &cmd->state.tiling_config; - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_image_view *iview = fb->attachments[a].attachment; - const struct tu_render_pass_attachment *attachment = - &cmd->state.pass->attachments[a]; - - if (attachment->gmem_offset < 0) - return; - - const uint32_t x1 = tiling->render_area.offset.x; - const uint32_t y1 = tiling->render_area.offset.y; - const uint32_t x2 = x1 + tiling->render_area.extent.width; - const uint32_t y2 = y1 + tiling->render_area.extent.height; - const uint32_t tile_x2 = - tiling->tile0.offset.x + tiling->tile0.extent.width * tiling->tile_count.width; - const uint32_t tile_y2 = - tiling->tile0.offset.y + tiling->tile0.extent.height * tiling->tile_count.height; - bool 
need_load = - x1 != tiling->tile0.offset.x || x2 != MIN2(fb->width, tile_x2) || - y1 != tiling->tile0.offset.y || y2 != MIN2(fb->height, tile_y2); - - if (need_load) - tu_finishme("improve handling of unaligned render area"); - - if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) - need_load = true; - - if (vk_format_has_stencil(iview->vk_format) && - attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) - need_load = true; - - if (need_load) { - tu6_emit_blit_info(cmd, cs, iview, attachment->gmem_offset, false); - tu6_emit_blit(cmd, cs); - } -} - -static void -tu6_emit_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - uint32_t a, - const VkRenderPassBeginInfo *info) -{ - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_image_view *iview = fb->attachments[a].attachment; - const struct tu_render_pass_attachment *attachment = - &cmd->state.pass->attachments[a]; - unsigned clear_mask = 0; - - /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */ - if (attachment->gmem_offset < 0) - return; - - if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) - clear_mask = 0xf; - - if (vk_format_has_stencil(iview->vk_format)) { - clear_mask &= 0x1; - if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) - clear_mask |= 0x2; - } - if (!clear_mask) - return; - - tu_clear_gmem_attachment(cmd, cs, a, clear_mask, - &info->pClearValues[a]); -} - -static void -tu6_emit_predicated_blit(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - uint32_t gmem_a, - bool resolve) -{ - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); - - tu6_emit_blit_info(cmd, cs, - cmd->state.framebuffer->attachments[a].attachment, - cmd->state.pass->attachments[gmem_a].gmem_offset, resolve); - tu6_emit_blit(cmd, cs); - - tu_cond_exec_end(cs); -} - static void tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -955,48 +753,10 @@ tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd, uint32_t gmem_a) { const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_image_view *dst = fb->attachments[a].attachment; - const struct tu_image_view *src = fb->attachments[gmem_a].attachment; - - tu_blit(cmd, cs, &(struct tu_blit) { - .dst = sysmem_attachment_surf(dst, dst->base_layer, - &cmd->state.tiling_config.render_area), - .src = sysmem_attachment_surf(src, src->base_layer, - &cmd->state.tiling_config.render_area), - .layers = fb->layers, - }); -} - + struct tu_image_view *dst = fb->attachments[a].attachment; + struct tu_image_view *src = fb->attachments[gmem_a].attachment; -/* Emit a MSAA resolve operation, with both gmem and sysmem paths. 
*/ -static void tu6_emit_resolve(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - uint32_t gmem_a) -{ - if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE) - return; - - tu6_emit_predicated_blit(cmd, cs, a, gmem_a, true); - - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); - tu6_emit_sysmem_resolve(cmd, cs, a, gmem_a); - tu_cond_exec_end(cs); -} - -static void -tu6_emit_store_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - uint32_t gmem_a) -{ - if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE) - return; - - tu6_emit_blit_info(cmd, cs, - cmd->state.framebuffer->attachments[a].attachment, - cmd->state.pass->attachments[gmem_a].gmem_offset, true); - tu6_emit_blit(cmd, cs); + tu_resolve_sysmem(cmd, cs, src, dst, fb->layers, &cmd->state.tiling_config.render_area); } static void @@ -1018,19 +778,20 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE)); - tu6_emit_blit_scissor(cmd, cs, true); + /* blit scissor may have been changed by CmdClearAttachments */ + tu6_emit_blit_scissor(cmd, cs, false); for (uint32_t a = 0; a < pass->attachment_count; ++a) { if (pass->attachments[a].gmem_offset >= 0) - tu6_emit_store_attachment(cmd, cs, a, a); + tu_store_gmem_attachment(cmd, cs, a, a); } if (subpass->resolve_attachments) { for (unsigned i = 0; i < subpass->color_count; i++) { uint32_t a = subpass->resolve_attachments[i].attachment; if (a != VK_ATTACHMENT_UNUSED) - tu6_emit_store_attachment(cmd, cs, a, - subpass->color_attachments[i].attachment); + tu_store_gmem_attachment(cmd, cs, a, + subpass->color_attachments[i].attachment); } } } @@ -1331,7 +1092,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) uint32_t x2 = tiling->render_area.offset.x + tiling->render_area.extent.width - 1; uint32_t y2 = tiling->render_area.offset.y + tiling->render_area.extent.height - 1; - tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2); + tu6_emit_window_scissor(cs, x1, y1, x2, y2); tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING)); @@ -1395,44 +1156,6 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) cmd->wait_for_idle = false; } -static void -tu_emit_sysmem_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - uint32_t a, - const VkRenderPassBeginInfo *info) -{ - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_image_view *iview = fb->attachments[a].attachment; - const struct tu_render_pass_attachment *attachment = - &cmd->state.pass->attachments[a]; - unsigned clear_mask = 0; - - /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */ - if (attachment->gmem_offset < 0) - return; - - if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_mask = 0xf; - } - - if (vk_format_has_stencil(iview->vk_format)) { - clear_mask &= 0x1; - if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) - clear_mask |= 0x2; - if (clear_mask != 0x3) - tu_finishme("depth/stencil only load op"); - } - - if (!clear_mask) - return; - - tu_clear_sysmem_attachment(cmd, cs, a, - &info->pClearValues[a], &(struct VkClearRect) { - .rect = info->renderArea, - .baseArrayLayer = iview->base_layer, - .layerCount = iview->layer_count, - }); -} - static void tu_emit_load_clear(struct tu_cmd_buffer *cmd, const VkRenderPassBeginInfo *info) @@ -1444,26 +1167,19 @@ 
tu_emit_load_clear(struct tu_cmd_buffer *cmd, tu6_emit_blit_scissor(cmd, cs, true); for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu6_emit_load_attachment(cmd, cs, i); + tu_load_gmem_attachment(cmd, cs, i); tu6_emit_blit_scissor(cmd, cs, false); for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu6_emit_clear_attachment(cmd, cs, i, info); + tu_clear_gmem_attachment(cmd, cs, i, info); tu_cond_exec_end(cs); - /* invalidate because reading input attachments will cache GMEM and - * the cache isn''t updated when GMEM is written - * TODO: is there a no-cache bit for textures? - */ - if (cmd->state.subpass->input_count) - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu_emit_sysmem_clear_attachment(cmd, cs, i, info); + tu_clear_sysmem_attachment(cmd, cs, i, info); tu_cond_exec_end(cs); } @@ -1476,8 +1192,8 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_framebuffer *fb = cmd->state.framebuffer; assert(fb->width > 0 && fb->height > 0); - tu6_emit_window_scissor(cmd, cs, 0, 0, fb->width - 1, fb->height - 1); - tu6_emit_window_offset(cmd, cs, 0, 0); + tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1); + tu6_emit_window_offset(cs, 0, 0); tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */ @@ -1516,7 +1232,6 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) /* Do any resolves of the last subpass. These are handled in the * tile_store_ib in the gmem path. */ - const struct tu_subpass *subpass = cmd->state.subpass; if (subpass->resolve_attachments) { for (unsigned i = 0; i < subpass->color_count; i++) { @@ -1555,7 +1270,13 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x0); - tu6_emit_wfi(cmd, cs); + /* TODO: flushing with barriers instead of blindly always flushing */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false); + + tu_cs_emit_wfi(cs); tu_cs_emit_regs(cs, A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_gmem, .gmem = 1)); @@ -1684,7 +1405,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd) static void tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd) { - const uint32_t tile_store_space = 32 + 23 * cmd->state.pass->attachment_count; + const uint32_t tile_store_space = 11 + (35 * 2) * cmd->state.pass->attachment_count; struct tu_cs sub_cs; VkResult result = @@ -1708,7 +1429,7 @@ tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd, struct tu_tiling_config *tiling = &cmd->state.tiling_config; tiling->render_area = *render_area; - tiling->force_sysmem = force_sysmem(cmd, render_area); + tiling->force_sysmem = false; tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass->gmem_pixels); tu_tiling_config_update_pipe_layout(tiling, dev); @@ -2583,7 +2304,7 @@ tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer, tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs); tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs); - tu6_emit_msaa(cmd, cmd->state.subpass, &cmd->draw_cs); + tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples); tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false); /* note: use_hw_binning only checks tiling 
config */ @@ -2614,53 +2335,66 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) struct tu_cs *cs = &cmd->draw_cs; const struct tu_subpass *subpass = cmd->state.subpass++; - /* TODO: - * if msaa samples change between subpasses, - * attachment store is broken for some attachments - */ + + tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); + if (subpass->resolve_attachments) { - tu6_emit_blit_scissor(cmd, cs, true); for (unsigned i = 0; i < subpass->color_count; i++) { uint32_t a = subpass->resolve_attachments[i].attachment; - if (a != VK_ATTACHMENT_UNUSED) { - tu6_emit_resolve(cmd, cs, a, - subpass->color_attachments[i].attachment); - } + if (a == VK_ATTACHMENT_UNUSED) + continue; + + tu_store_gmem_attachment(cmd, cs, a, + subpass->color_attachments[i].attachment); + + if (pass->attachments[a].gmem_offset < 0) + continue; + + /* TODO: + * check if the resolved attachment is needed by later subpasses, + * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM.. + */ + tu_finishme("missing GMEM->GMEM resolve path\n"); + tu_emit_load_gmem_attachment(cmd, cs, a); } } - /* invalidate because reading input attachments will cache GMEM and - * the cache isn''t updated when GMEM is written - * TODO: is there a no-cache bit for textures? - */ - if (cmd->state.subpass->input_count) - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + tu_cond_exec_end(cs); - /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */ - tu6_emit_zs(cmd, cmd->state.subpass, cs); - tu6_emit_mrt(cmd, cmd->state.subpass, cs); - tu6_emit_msaa(cmd, cmd->state.subpass, cs); - tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false); + tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); - /* Emit flushes so that input attachments will read the correct value. This - * is for sysmem only, although it shouldn't do much harm on gmem. + /* Emit flushes so that input attachments will read the correct value. 
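+    * This sits inside the sysmem cond-exec block, so the gmem path is unaffected.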
+ * TODO: use subpass dependencies to flush or not */ tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true); tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true); - /* TODO: - * since we don't know how to do GMEM->GMEM resolve, - * resolve attachments are resolved to memory then loaded to GMEM again if needed - */ if (subpass->resolve_attachments) { + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + for (unsigned i = 0; i < subpass->color_count; i++) { uint32_t a = subpass->resolve_attachments[i].attachment; - if (a != VK_ATTACHMENT_UNUSED && pass->attachments[a].gmem_offset >= 0) { - tu_finishme("missing GMEM->GMEM resolve, performance will suffer\n"); - tu6_emit_predicated_blit(cmd, cs, a, a, false); - } + if (a == VK_ATTACHMENT_UNUSED) + continue; + + tu6_emit_sysmem_resolve(cmd, cs, a, + subpass->color_attachments[i].attachment); } + + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true); } + + tu_cond_exec_end(cs); + + /* subpass->input_count > 0 then texture cache invalidate is likely to be needed */ + if (cmd->state.subpass->input_count) + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + + /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */ + tu6_emit_zs(cmd, cmd->state.subpass, cs); + tu6_emit_mrt(cmd, cmd->state.subpass, cs); + tu6_emit_msaa(cs, cmd->state.subpass->samples); + tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false); } void @@ -4137,7 +3871,7 @@ struct tu_barrier_info }; static void -tu_barrier(struct tu_cmd_buffer *cmd_buffer, +tu_barrier(struct tu_cmd_buffer *cmd, uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, @@ -4146,13 +3880,24 @@ tu_barrier(struct tu_cmd_buffer *cmd_buffer, const VkImageMemoryBarrier *pImageMemoryBarriers, const struct tu_barrier_info *info) { + /* renderpass case is only for subpass self-dependencies + * which means syncing the render output with texture cache + * note: only the CACHE_INVALIDATE is needed in GMEM mode + * and in sysmem mode we might not need either color/depth flush + */ + if (cmd->state.pass) { + tu6_emit_event_write(cmd, &cmd->draw_cs, PC_CCU_FLUSH_COLOR_TS, true); + tu6_emit_event_write(cmd, &cmd->draw_cs, PC_CCU_FLUSH_DEPTH_TS, true); + tu6_emit_event_write(cmd, &cmd->draw_cs, CACHE_INVALIDATE, false); + return; + } } void tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags destStageMask, - VkBool32 byRegion, + VkPipelineStageFlags dstStageMask, + VkDependencyFlags dependencyFlags, uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, diff --git a/src/freedreno/vulkan/tu_formats.c b/src/freedreno/vulkan/tu_formats.c index 6a8f3d022a2..528a4176f4a 100644 --- a/src/freedreno/vulkan/tu_formats.c +++ b/src/freedreno/vulkan/tu_formats.c @@ -28,10 +28,6 @@ #include "registers/adreno_common.xml.h" #include "registers/a6xx.xml.h" -#include "util/format_r11g11b10f.h" -#include "util/format_rgb9e5.h" -#include "util/format_srgb.h" -#include "util/u_half.h" #include "vk_format.h" #include "vk_util.h" #include "drm-uapi/drm_fourcc.h" @@ -222,13 +218,13 @@ static const struct tu_native_format tu6_format_table[] = { TU6_xTx(E5B9G9R9_UFLOAT_PACK32, 9_9_9_E5_FLOAT, WZYX), /* 123 */ /* depth/stencil */ - TU6_xTC(D16_UNORM, 16_UNORM, WZYX), /* 124 */ - TU6_xTC(X8_D24_UNORM_PACK32, Z24_UNORM_S8_UINT, WZYX), /* 125 */ - TU6_xTC(D32_SFLOAT, 32_FLOAT, WZYX), /* 126 */ - TU6_xTC(S8_UINT, 8_UINT, WZYX), /* 127 */ - 
TU6_xxx(D16_UNORM_S8_UINT, X8Z16_UNORM, WZYX), /* 128 */ - TU6_xTC(D24_UNORM_S8_UINT, Z24_UNORM_S8_UINT, WZYX), /* 129 */ - TU6_xxx(D32_SFLOAT_S8_UINT, x, WZYX), /* 130 */ + TU6_xTC(D16_UNORM, 16_UNORM, WZYX), /* 124 */ + TU6_xTC(X8_D24_UNORM_PACK32, Z24_UNORM_S8_UINT_AS_R8G8B8A8, WZYX), /* 125 */ + TU6_xTC(D32_SFLOAT, 32_FLOAT, WZYX), /* 126 */ + TU6_xTC(S8_UINT, 8_UINT, WZYX), /* 127 */ + TU6_xxx(D16_UNORM_S8_UINT, X8Z16_UNORM, WZYX), /* 128 */ + TU6_xTC(D24_UNORM_S8_UINT, Z24_UNORM_S8_UINT_AS_R8G8B8A8, WZYX), /* 129 */ + TU6_xxx(D32_SFLOAT_S8_UINT, x, WZYX), /* 130 */ /* compressed */ TU6_xTx(BC1_RGB_UNORM_BLOCK, DXT1, WZYX), /* 131 */ @@ -348,75 +344,6 @@ tu6_format_texture(VkFormat format, enum a6xx_tile_mode tile_mode) return fmt; } -enum a6xx_2d_ifmt -tu6_fmt_to_ifmt(enum a6xx_format fmt) -{ - switch (fmt) { - case FMT6_A8_UNORM: - case FMT6_8_UNORM: - case FMT6_8_SNORM: - case FMT6_8_8_UNORM: - case FMT6_8_8_SNORM: - case FMT6_8_8_8_8_UNORM: - case FMT6_8_8_8_X8_UNORM: - case FMT6_8_8_8_8_SNORM: - case FMT6_4_4_4_4_UNORM: - case FMT6_5_5_5_1_UNORM: - case FMT6_5_6_5_UNORM: - case FMT6_Z24_UNORM_S8_UINT: - case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: - return R2D_UNORM8; - - case FMT6_32_UINT: - case FMT6_32_SINT: - case FMT6_32_32_UINT: - case FMT6_32_32_SINT: - case FMT6_32_32_32_32_UINT: - case FMT6_32_32_32_32_SINT: - return R2D_INT32; - - case FMT6_16_UINT: - case FMT6_16_SINT: - case FMT6_16_16_UINT: - case FMT6_16_16_SINT: - case FMT6_16_16_16_16_UINT: - case FMT6_16_16_16_16_SINT: - case FMT6_10_10_10_2_UINT: - return R2D_INT16; - - case FMT6_8_UINT: - case FMT6_8_SINT: - case FMT6_8_8_UINT: - case FMT6_8_8_SINT: - case FMT6_8_8_8_8_UINT: - case FMT6_8_8_8_8_SINT: - return R2D_INT8; - - case FMT6_16_UNORM: - case FMT6_16_SNORM: - case FMT6_16_16_UNORM: - case FMT6_16_16_SNORM: - case FMT6_16_16_16_16_UNORM: - case FMT6_16_16_16_16_SNORM: - case FMT6_32_FLOAT: - case FMT6_32_32_FLOAT: - case FMT6_32_32_32_32_FLOAT: - return R2D_FLOAT32; - - case FMT6_16_FLOAT: - case FMT6_16_16_FLOAT: - case FMT6_16_16_16_16_FLOAT: - case FMT6_11_11_10_FLOAT: - case FMT6_10_10_10_2_UNORM: - case FMT6_10_10_10_2_UNORM_DEST: - return R2D_FLOAT16; - - default: - unreachable("bad format"); - return 0; - } -} - enum a6xx_depth_format tu6_pipe2depth(VkFormat format) { @@ -433,306 +360,6 @@ tu6_pipe2depth(VkFormat format) } } -static uint32_t -tu_pack_mask(int bits) -{ - assert(bits <= 32); - return (1ull << bits) - 1; -} - -static uint32_t -tu_pack_float32_for_unorm(float val, int bits) -{ - const uint32_t max = tu_pack_mask(bits); - if (val < 0.0f) - return 0; - else if (val > 1.0f) - return max; - else - return _mesa_lroundevenf(val * (float) max); -} - -static uint32_t -tu_pack_float32_for_snorm(float val, int bits) -{ - const int32_t max = tu_pack_mask(bits - 1); - int32_t tmp; - if (val < -1.0f) - tmp = -max; - else if (val > 1.0f) - tmp = max; - else - tmp = _mesa_lroundevenf(val * (float) max); - - return tmp & tu_pack_mask(bits); -} - -static uint32_t -tu_pack_float32_for_uscaled(float val, int bits) -{ - const uint32_t max = tu_pack_mask(bits); - if (val < 0.0f) - return 0; - else if (val > (float) max) - return max; - else - return (uint32_t) val; -} - -static uint32_t -tu_pack_float32_for_sscaled(float val, int bits) -{ - const int32_t max = tu_pack_mask(bits - 1); - const int32_t min = -max - 1; - int32_t tmp; - if (val < (float) min) - tmp = min; - else if (val > (float) max) - tmp = max; - else - tmp = (int32_t) val; - - return tmp & tu_pack_mask(bits); -} - -static uint32_t 
-tu_pack_uint32_for_uint(uint32_t val, int bits) -{ - return val & tu_pack_mask(bits); -} - -static uint32_t -tu_pack_int32_for_sint(int32_t val, int bits) -{ - return val & tu_pack_mask(bits); -} - -static uint32_t -tu_pack_float32_for_sfloat(float val, int bits) -{ - assert(bits == 16 || bits == 32); - return bits == 16 ? util_float_to_half(val) : fui(val); -} - -union tu_clear_component_value { - float float32; - int32_t int32; - uint32_t uint32; -}; - -static uint32_t -tu_pack_clear_component_value(union tu_clear_component_value val, - const struct util_format_channel_description *ch) -{ - uint32_t packed; - - switch (ch->type) { - case UTIL_FORMAT_TYPE_UNSIGNED: - /* normalized, scaled, or pure integer */ - if (ch->normalized) - packed = tu_pack_float32_for_unorm(val.float32, ch->size); - else if (ch->pure_integer) - packed = tu_pack_uint32_for_uint(val.uint32, ch->size); - else - packed = tu_pack_float32_for_uscaled(val.float32, ch->size); - break; - case UTIL_FORMAT_TYPE_SIGNED: - /* normalized, scaled, or pure integer */ - if (ch->normalized) - packed = tu_pack_float32_for_snorm(val.float32, ch->size); - else if (ch->pure_integer) - packed = tu_pack_int32_for_sint(val.int32, ch->size); - else - packed = tu_pack_float32_for_sscaled(val.float32, ch->size); - break; - case UTIL_FORMAT_TYPE_FLOAT: - packed = tu_pack_float32_for_sfloat(val.float32, ch->size); - break; - default: - unreachable("unexpected channel type"); - packed = 0; - break; - } - - assert((packed & tu_pack_mask(ch->size)) == packed); - return packed; -} - -static const struct util_format_channel_description * -tu_get_format_channel_description(const struct util_format_description *desc, - int comp) -{ - switch (desc->swizzle[comp]) { - case PIPE_SWIZZLE_X: - return &desc->channel[0]; - case PIPE_SWIZZLE_Y: - return &desc->channel[1]; - case PIPE_SWIZZLE_Z: - return &desc->channel[2]; - case PIPE_SWIZZLE_W: - return &desc->channel[3]; - default: - return NULL; - } -} - -static union tu_clear_component_value -tu_get_clear_component_value(const VkClearValue *val, int comp, - enum util_format_colorspace colorspace) -{ - assert(comp < 4); - - union tu_clear_component_value tmp; - switch (colorspace) { - case UTIL_FORMAT_COLORSPACE_ZS: - assert(comp < 2); - if (comp == 0) - tmp.float32 = val->depthStencil.depth; - else - tmp.uint32 = val->depthStencil.stencil; - break; - case UTIL_FORMAT_COLORSPACE_SRGB: - if (comp < 3) { - tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]); - break; - } - default: - assert(comp < 4); - tmp.uint32 = val->color.uint32[comp]; - break; - } - - return tmp; -} - -/** - * Pack a VkClearValue into a 128-bit buffer. \a format is respected except - * for the component order. The components are always packed in WZYX order - * (i.e., msb is white and lsb is red). - * - * Return the number of uint32_t's used. - */ -void -tu_pack_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4]) -{ - const struct util_format_description *desc = vk_format_description(format); - - switch (format) { - case VK_FORMAT_B10G11R11_UFLOAT_PACK32: - buf[0] = float3_to_r11g11b10f(val->color.float32); - return; - case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: - buf[0] = float3_to_rgb9e5(val->color.float32); - return; - default: - break; - } - - assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); - - /* S8_UINT is special and has no depth */ - const int max_components = - format == VK_FORMAT_S8_UINT ? 
2 : desc->nr_channels; - - int buf_offset = 0; - int bit_shift = 0; - for (int comp = 0; comp < max_components; comp++) { - const struct util_format_channel_description *ch = - tu_get_format_channel_description(desc, comp); - if (!ch) { - assert((format == VK_FORMAT_S8_UINT && comp == 0) || - (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1)); - continue; - } - - union tu_clear_component_value v = tu_get_clear_component_value( - val, comp, desc->colorspace); - - /* move to the next uint32_t when there is not enough space */ - assert(ch->size <= 32); - if (bit_shift + ch->size > 32) { - buf_offset++; - bit_shift = 0; - } - - if (bit_shift == 0) - buf[buf_offset] = 0; - - buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift; - bit_shift += ch->size; - } -} - -void -tu_2d_clear_color(const VkClearColorValue *val, VkFormat format, uint32_t buf[4]) -{ - const struct util_format_description *desc = vk_format_description(format); - - /* not supported by 2D engine, cleared as U32 */ - if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) { - buf[0] = float3_to_rgb9e5(val->float32); - return; - } - - enum a6xx_2d_ifmt ifmt = tu6_fmt_to_ifmt(tu6_get_native_format(format).fmt); - - assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN || - format == VK_FORMAT_B10G11R11_UFLOAT_PACK32)); - - for (unsigned i = 0; i < desc->nr_channels; i++) { - const struct util_format_channel_description *ch = &desc->channel[i]; - - switch (ifmt) { - case R2D_INT32: - case R2D_INT16: - case R2D_INT8: - case R2D_FLOAT32: - buf[i] = val->uint32[i]; - break; - case R2D_FLOAT16: - buf[i] = util_float_to_half(val->float32[i]); - break; - case R2D_UNORM8: { - float linear = val->float32[i]; - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3) - linear = util_format_linear_to_srgb_float(val->float32[i]); - - if (ch->type == UTIL_FORMAT_TYPE_SIGNED) - buf[i] = tu_pack_float32_for_snorm(linear, 8); - else - buf[i] = tu_pack_float32_for_unorm(linear, 8); - } break; - default: - unreachable("unexpected ifmt"); - break; - } - } -} - -void -tu_2d_clear_zs(const VkClearDepthStencilValue *val, VkFormat format, uint32_t buf[4]) -{ - switch (format) { - case VK_FORMAT_X8_D24_UNORM_PACK32: - case VK_FORMAT_D24_UNORM_S8_UINT: - buf[0] = tu_pack_float32_for_unorm(val->depth, 24); - buf[1] = buf[0] >> 8; - buf[2] = buf[0] >> 16; - buf[3] = val->stencil; - return; - case VK_FORMAT_D16_UNORM: - case VK_FORMAT_D32_SFLOAT: - buf[0] = fui(val->depth); - return; - case VK_FORMAT_S8_UINT: - buf[0] = val->stencil; - return; - default: - unreachable("unexpected zs format"); - break; - } -} - static void tu_physical_device_get_format_properties( struct tu_physical_device *physical_device, diff --git a/src/freedreno/vulkan/tu_image.c b/src/freedreno/vulkan/tu_image.c index ddb8dbebf7b..4e2b8692605 100644 --- a/src/freedreno/vulkan/tu_image.c +++ b/src/freedreno/vulkan/tu_image.c @@ -111,13 +111,6 @@ tu_image_create(VkDevice _device, ubwc_enabled = false; } - /* using UBWC with D24S8 breaks the "stencil read" copy path (why?) 
- * (causes any deqp tests that need to check stencil to fail) - * disable UBWC for this format until we properly support copy aspect masks - */ - if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) - ubwc_enabled = false; - /* UBWC can't be used with E5B9G9R9 */ if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) ubwc_enabled = false; @@ -166,7 +159,7 @@ tu_image_create(VkDevice _device, return VK_SUCCESS; } -static enum a6xx_tex_fetchsize +enum a6xx_tex_fetchsize tu6_fetchsize(VkFormat format) { if (vk_format_description(format)->layout == UTIL_FORMAT_LAYOUT_ASTC) @@ -277,24 +270,27 @@ tu_image_view_init(struct tu_image_view *iview, memset(iview->descriptor, 0, sizeof(iview->descriptor)); struct tu_native_format fmt = - tu6_format_texture(iview->vk_format, image->layout.tile_mode); + tu6_format_image_src(image, iview->vk_format, iview->base_mip); uint64_t base_addr = tu_image_base(image, iview->base_mip, iview->base_layer); uint64_t ubwc_addr = tu_image_ubwc_base(image, iview->base_mip, iview->base_layer); - uint32_t pitch = tu_image_stride(image, iview->base_mip) / vk_format_get_blockwidth(iview->vk_format); - enum a6xx_tile_mode tile_mode = tu6_get_image_tile_mode(image, iview->base_mip); + uint32_t pitch = tu_image_pitch(image, iview->base_mip); uint32_t width = iview->extent.width; uint32_t height = iview->extent.height; uint32_t depth = pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count; unsigned fmt_tex = fmt.fmt; - if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && - iview->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) - fmt_tex = FMT6_S8Z24_UINT; + if (fmt_tex == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) { + if (iview->aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) + fmt_tex = FMT6_Z24_UNORM_S8_UINT; + if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + fmt_tex = FMT6_S8Z24_UINT; + /* TODO: also use this format with storage descriptor ? */ + } iview->descriptor[0] = - A6XX_TEX_CONST_0_TILE_MODE(tile_mode) | + A6XX_TEX_CONST_0_TILE_MODE(fmt.tile_mode) | COND(vk_format_is_srgb(iview->vk_format), A6XX_TEX_CONST_0_SRGB) | A6XX_TEX_CONST_0_FMT(fmt_tex) | A6XX_TEX_CONST_0_SAMPLES(tu_msaa_samples(image->samples)) | @@ -335,7 +331,7 @@ tu_image_view_init(struct tu_image_view *iview, iview->storage_descriptor[0] = A6XX_IBO_0_FMT(fmt.fmt) | - A6XX_IBO_0_TILE_MODE(tile_mode); + A6XX_IBO_0_TILE_MODE(fmt.tile_mode); iview->storage_descriptor[1] = A6XX_IBO_1_WIDTH(width) | A6XX_IBO_1_HEIGHT(height); diff --git a/src/freedreno/vulkan/tu_meta_blit.c b/src/freedreno/vulkan/tu_meta_blit.c deleted file mode 100644 index b45309336ed..00000000000 --- a/src/freedreno/vulkan/tu_meta_blit.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#include "tu_private.h" - -#include "tu_blit.h" - -static void -tu_blit_image(struct tu_cmd_buffer *cmdbuf, - struct tu_image *src_image, - struct tu_image *dst_image, - const VkImageBlit *info, - VkFilter filter) -{ - static const enum a6xx_rotation rotate[2][2] = { - {ROTATE_0, ROTATE_HFLIP}, - {ROTATE_VFLIP, ROTATE_180}, - }; - bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) != - (info->dstOffsets[1].x < info->dstOffsets[0].x); - bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) != - (info->dstOffsets[1].y < info->dstOffsets[0].y); - bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) != - (info->dstOffsets[1].z < info->dstOffsets[0].z); - - if (mirror_z) { - tu_finishme("blit z mirror\n"); - return; - } - - if (info->srcOffsets[1].z - info->srcOffsets[0].z != - info->dstOffsets[1].z - info->dstOffsets[0].z) { - tu_finishme("blit z filter\n"); - return; - } - assert(info->dstSubresource.layerCount == info->srcSubresource.layerCount); - - struct tu_blit blt = { - .dst = tu_blit_surf(dst_image, info->dstSubresource, info->dstOffsets), - .src = tu_blit_surf(src_image, info->srcSubresource, info->srcOffsets), - .layers = MAX2(info->srcOffsets[1].z - info->srcOffsets[0].z, - info->dstSubresource.layerCount), - .filter = filter == VK_FILTER_LINEAR, - .rotation = rotate[mirror_y][mirror_x], - }; - - tu_blit(cmdbuf, &cmdbuf->cs, &blt); -} - -void -tu_CmdBlitImage(VkCommandBuffer commandBuffer, - VkImage srcImage, - VkImageLayout srcImageLayout, - VkImage destImage, - VkImageLayout destImageLayout, - uint32_t regionCount, - const VkImageBlit *pRegions, - VkFilter filter) - -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_image, src_image, srcImage); - TU_FROM_HANDLE(tu_image, dst_image, destImage); - - tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); - - for (uint32_t i = 0; i < regionCount; ++i) { - tu_blit_image(cmdbuf, src_image, dst_image, pRegions + i, filter); - } -} diff --git a/src/freedreno/vulkan/tu_meta_buffer.c b/src/freedreno/vulkan/tu_meta_buffer.c deleted file mode 100644 index fd82b9cdf8d..00000000000 --- a/src/freedreno/vulkan/tu_meta_buffer.c +++ /dev/null @@ -1,75 +0,0 @@ -#include "tu_private.h" -#include "tu_blit.h" -#include "tu_cs.h" - -void -tu_CmdFillBuffer(VkCommandBuffer commandBuffer, - VkBuffer dstBuffer, - VkDeviceSize dstOffset, - VkDeviceSize fillSize, - uint32_t data) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); - - if (fillSize == VK_WHOLE_SIZE) - fillSize = buffer->size - dstOffset; - - tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); - - tu_blit(cmd, &cmd->cs, &(struct tu_blit) { - .dst = { - .fmt = VK_FORMAT_R32_UINT, - .va = tu_buffer_iova(buffer) + dstOffset, - .width = fillSize / 4, - .height = 1, - .samples = 1, - }, - .layers = 1, - .clear_value[0] = data, - .type = TU_BLIT_CLEAR, - .buffer = true, - }); -} - -void 
-tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, - VkBuffer dstBuffer, - VkDeviceSize dstOffset, - VkDeviceSize dataSize, - const void *pData) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); - - tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); - - struct ts_cs_memory tmp; - VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp); - if (result != VK_SUCCESS) { - cmd->record_result = result; - return; - } - - memcpy(tmp.map, pData, dataSize); - - tu_blit(cmd, &cmd->cs, &(struct tu_blit) { - .dst = { - .fmt = VK_FORMAT_R32_UINT, - .va = tu_buffer_iova(buffer) + dstOffset, - .width = dataSize / 4, - .height = 1, - .samples = 1, - }, - .src = { - .fmt = VK_FORMAT_R32_UINT, - .va = tmp.iova, - .width = dataSize / 4, - .height = 1, - .samples = 1, - }, - .layers = 1, - .type = TU_BLIT_COPY, - .buffer = true, - }); -} diff --git a/src/freedreno/vulkan/tu_meta_clear.c b/src/freedreno/vulkan/tu_meta_clear.c deleted file mode 100644 index 500f6ae3529..00000000000 --- a/src/freedreno/vulkan/tu_meta_clear.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. 
- */ - -#include "tu_private.h" -#include "tu_blit.h" -#include "tu_cs.h" - -static void -clear_image(struct tu_cmd_buffer *cmdbuf, - struct tu_image *image, - uint32_t clear_value[4], - const VkImageSubresourceRange *range) -{ - uint32_t level_count = tu_get_levelCount(image, range); - uint32_t layer_count = tu_get_layerCount(image, range); - - if (image->type == VK_IMAGE_TYPE_3D) { - assert(layer_count == 1); - assert(range->baseArrayLayer == 0); - } - - for (unsigned j = 0; j < level_count; j++) { - if (image->type == VK_IMAGE_TYPE_3D) - layer_count = u_minify(image->extent.depth, range->baseMipLevel + j); - - tu_blit(cmdbuf, &cmdbuf->cs, &(struct tu_blit) { - .dst = tu_blit_surf_whole(image, range->baseMipLevel + j, range->baseArrayLayer), - .layers = layer_count, - .clear_value = {clear_value[0], clear_value[1], clear_value[2], clear_value[3]}, - .type = TU_BLIT_CLEAR, - }); - } -} - -void -tu_CmdClearColorImage(VkCommandBuffer commandBuffer, - VkImage image_h, - VkImageLayout imageLayout, - const VkClearColorValue *pColor, - uint32_t rangeCount, - const VkImageSubresourceRange *pRanges) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_image, image, image_h); - uint32_t clear_value[4] = {}; - - tu_2d_clear_color(pColor, image->vk_format, clear_value); - - tu_bo_list_add(&cmdbuf->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); - - for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmdbuf, image, clear_value, pRanges + i); -} - -void -tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, - VkImage image_h, - VkImageLayout imageLayout, - const VkClearDepthStencilValue *pDepthStencil, - uint32_t rangeCount, - const VkImageSubresourceRange *pRanges) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_image, image, image_h); - uint32_t clear_value[4] = {}; - - tu_2d_clear_zs(pDepthStencil, image->vk_format, clear_value); - - tu_bo_list_add(&cmdbuf->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); - - for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmdbuf, image, clear_value, pRanges + i); -} - -void -tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t attachment, - const VkClearValue *value, - const VkClearRect *rect) -{ - if (!cmd->state.framebuffer) { - tu_finishme("sysmem CmdClearAttachments in secondary command buffer"); - return; - } - - const struct tu_image_view *iview = - cmd->state.framebuffer->attachments[attachment].attachment; - - uint32_t clear_vals[4] = { 0 }; - if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | - VK_IMAGE_ASPECT_STENCIL_BIT)) { - tu_2d_clear_zs(&value->depthStencil, iview->vk_format, - clear_vals); - } else { - tu_2d_clear_color(&value->color, iview->vk_format, - clear_vals); - } - - tu_blit(cmd, cs, &(struct tu_blit) { - .dst = sysmem_attachment_surf(iview, rect->baseArrayLayer, &rect->rect), - .layers = rect->layerCount, - .clear_value = { clear_vals[0], clear_vals[1], clear_vals[2], clear_vals[3] }, - .type = TU_BLIT_CLEAR, - }); -} - -void -tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t attachment, - uint8_t component_mask, - const VkClearValue *value) -{ - VkFormat fmt = cmd->state.pass->attachments[attachment].format; - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(fmt))); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask)); - - tu_cs_emit_pkt4(cs, 
REG_A6XX_RB_BLIT_BASE_GMEM, 1); - tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); - tu_cs_emit(cs, 0); - - uint32_t clear_vals[4] = { 0 }; - tu_pack_clear_value(value, fmt, clear_vals); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); - tu_cs_emit(cs, clear_vals[0]); - tu_cs_emit(cs, clear_vals[1]); - tu_cs_emit(cs, clear_vals[2]); - tu_cs_emit(cs, clear_vals[3]); - - tu6_emit_event_write(cmd, cs, BLIT, false); -} - -void -tu_CmdClearAttachments(VkCommandBuffer commandBuffer, - uint32_t attachmentCount, - const VkClearAttachment *pAttachments, - uint32_t rectCount, - const VkClearRect *pRects) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - const struct tu_subpass *subpass = cmd->state.subpass; - struct tu_cs *cs = &cmd->draw_cs; - - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); - - for (unsigned i = 0; i < rectCount; i++) { - unsigned x1 = pRects[i].rect.offset.x; - unsigned y1 = pRects[i].rect.offset.y; - unsigned x2 = x1 + pRects[i].rect.extent.width - 1; - unsigned y2 = y1 + pRects[i].rect.extent.height - 1; - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2); - tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1)); - tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2)); - - for (unsigned j = 0; j < attachmentCount; j++) { - uint32_t a; - unsigned clear_mask = 0; - if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - clear_mask = 0xf; - a = subpass->color_attachments[pAttachments[j].colorAttachment].attachment; - } else { - a = subpass->depth_stencil_attachment.attachment; - if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) - clear_mask |= 1; - if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) - clear_mask |= 2; - } - - if (a == VK_ATTACHMENT_UNUSED) - continue; - - tu_clear_gmem_attachment(cmd, cs, a, clear_mask, - &pAttachments[j].clearValue); - - } - } - - tu_cond_exec_end(cs); - - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); - - for (unsigned i = 0; i < rectCount; i++) { - for (unsigned j = 0; j < attachmentCount; j++) { - uint32_t a; - unsigned clear_mask = 0; - if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - clear_mask = 0xf; - a = subpass->color_attachments[pAttachments[j].colorAttachment].attachment; - } else { - a = subpass->depth_stencil_attachment.attachment; - if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) - clear_mask |= 1; - if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) - clear_mask |= 2; - if (clear_mask != 3) - tu_finishme("sysmem depth/stencil only clears"); - } - - if (a == VK_ATTACHMENT_UNUSED) - continue; - - tu_clear_sysmem_attachment(cmd, cs, a, - &pAttachments[j].clearValue, - &pRects[i]); - } - } - - tu_cond_exec_end(cs); -} diff --git a/src/freedreno/vulkan/tu_meta_copy.c b/src/freedreno/vulkan/tu_meta_copy.c deleted file mode 100644 index cf7a38c64ec..00000000000 --- a/src/freedreno/vulkan/tu_meta_copy.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright © 2016 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the 
following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#include "tu_private.h" - -#include "a6xx.xml.h" -#include "adreno_common.xml.h" -#include "adreno_pm4.xml.h" - -#include "vk_format.h" - -#include "tu_cs.h" -#include "tu_blit.h" - -static void -tu_copy_buffer(struct tu_cmd_buffer *cmd, - struct tu_buffer *src, - struct tu_buffer *dst, - const VkBufferCopy *region) -{ - tu_bo_list_add(&cmd->bo_list, src->bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmd->bo_list, dst->bo, MSM_SUBMIT_BO_WRITE); - - tu_blit(cmd, &cmd->cs, &(struct tu_blit) { - .dst = { - .fmt = VK_FORMAT_R8_UNORM, - .va = tu_buffer_iova(dst) + region->dstOffset, - .width = region->size, - .height = 1, - .samples = 1, - }, - .src = { - .fmt = VK_FORMAT_R8_UNORM, - .va = tu_buffer_iova(src) + region->srcOffset, - .width = region->size, - .height = 1, - .samples = 1, - }, - .layers = 1, - .type = TU_BLIT_COPY, - .buffer = true, - }); -} - -static struct tu_blit_surf -tu_blit_buffer(struct tu_buffer *buffer, - VkFormat format, - const VkBufferImageCopy *info) -{ - if (info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) - format = VK_FORMAT_R8_UNORM; - - unsigned pitch = (info->bufferRowLength ?: info->imageExtent.width) * - vk_format_get_blocksize(format); - - return (struct tu_blit_surf) { - .fmt = format, - .tile_mode = TILE6_LINEAR, - .va = tu_buffer_iova(buffer) + info->bufferOffset, - .pitch = pitch, - .layer_size = (info->bufferImageHeight ?: info->imageExtent.height) * pitch / vk_format_get_blockwidth(format) / vk_format_get_blockheight(format), - .width = info->imageExtent.width, - .height = info->imageExtent.height, - .samples = 1, - }; -} - -static void -tu_copy_buffer_to_image(struct tu_cmd_buffer *cmdbuf, - struct tu_buffer *src_buffer, - struct tu_image *dst_image, - const VkBufferImageCopy *info) -{ - if (info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT && - vk_format_get_blocksize(dst_image->vk_format) == 4) { - tu_finishme("aspect mask\n"); - return; - } - - tu_blit(cmdbuf, &cmdbuf->cs, &(struct tu_blit) { - .dst = tu_blit_surf_ext(dst_image, info->imageSubresource, info->imageOffset, info->imageExtent), - .src = tu_blit_buffer(src_buffer, dst_image->vk_format, info), - .layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount), - .type = TU_BLIT_COPY, - }); -} - -static void -tu_copy_image_to_buffer(struct tu_cmd_buffer *cmdbuf, - struct tu_image *src_image, - struct tu_buffer *dst_buffer, - const VkBufferImageCopy *info) -{ - tu_blit(cmdbuf, &cmdbuf->cs, &(struct tu_blit) { - .dst = tu_blit_buffer(dst_buffer, src_image->vk_format, info), - .src = tu_blit_surf_ext(src_image, info->imageSubresource, info->imageOffset, info->imageExtent), - .layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount), - .type = TU_BLIT_COPY, - }); -} - -static void -tu_copy_image_to_image(struct tu_cmd_buffer *cmdbuf, - struct tu_image 
*src_image, - struct tu_image *dst_image, - const VkImageCopy *info) -{ - if ((info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT && - vk_format_get_blocksize(dst_image->vk_format) == 4) || - (info->srcSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT && - vk_format_get_blocksize(src_image->vk_format) == 4)) { - tu_finishme("aspect mask\n"); - return; - } - - tu_blit(cmdbuf, &cmdbuf->cs, &(struct tu_blit) { - .dst = tu_blit_surf_ext(dst_image, info->dstSubresource, info->dstOffset, info->extent), - .src = tu_blit_surf_ext(src_image, info->srcSubresource, info->srcOffset, info->extent), - .layers = info->extent.depth, - .type = TU_BLIT_COPY, - }); -} - -void -tu_CmdCopyBuffer(VkCommandBuffer commandBuffer, - VkBuffer srcBuffer, - VkBuffer destBuffer, - uint32_t regionCount, - const VkBufferCopy *pRegions) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer); - TU_FROM_HANDLE(tu_buffer, dst_buffer, destBuffer); - - for (unsigned i = 0; i < regionCount; ++i) - tu_copy_buffer(cmdbuf, src_buffer, dst_buffer, &pRegions[i]); -} - -void -tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer, - VkBuffer srcBuffer, - VkImage destImage, - VkImageLayout destImageLayout, - uint32_t regionCount, - const VkBufferImageCopy *pRegions) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_image, dst_image, destImage); - TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer); - - tu_bo_list_add(&cmdbuf->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); - - for (unsigned i = 0; i < regionCount; ++i) - tu_copy_buffer_to_image(cmdbuf, src_buffer, dst_image, pRegions + i); -} - -void -tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer, - VkImage srcImage, - VkImageLayout srcImageLayout, - VkBuffer destBuffer, - uint32_t regionCount, - const VkBufferImageCopy *pRegions) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_image, src_image, srcImage); - TU_FROM_HANDLE(tu_buffer, dst_buffer, destBuffer); - - tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmdbuf->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE); - - for (unsigned i = 0; i < regionCount; ++i) - tu_copy_image_to_buffer(cmdbuf, src_image, dst_buffer, pRegions + i); -} - -void -tu_CmdCopyImage(VkCommandBuffer commandBuffer, - VkImage srcImage, - VkImageLayout srcImageLayout, - VkImage destImage, - VkImageLayout destImageLayout, - uint32_t regionCount, - const VkImageCopy *pRegions) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_image, src_image, srcImage); - TU_FROM_HANDLE(tu_image, dst_image, destImage); - - tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); - - for (uint32_t i = 0; i < regionCount; ++i) - tu_copy_image_to_image(cmdbuf, src_image, dst_image, pRegions + i); -} diff --git a/src/freedreno/vulkan/tu_meta_resolve.c b/src/freedreno/vulkan/tu_meta_resolve.c deleted file mode 100644 index aac4dbf00c2..00000000000 --- a/src/freedreno/vulkan/tu_meta_resolve.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright © 2016 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, 
merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#include "tu_private.h" - -#include -#include - -#include "nir/nir_builder.h" -#include "vk_format.h" - -#include "tu_blit.h" - -static void -tu_resolve_image(struct tu_cmd_buffer *cmdbuf, - struct tu_image *src_image, - struct tu_image *dst_image, - const VkImageResolve *info) -{ - assert(info->dstSubresource.layerCount == info->srcSubresource.layerCount); - - tu_blit(cmdbuf, &cmdbuf->cs, &(struct tu_blit) { - .dst = tu_blit_surf_ext(dst_image, info->dstSubresource, info->dstOffset, info->extent), - .src = tu_blit_surf_ext(src_image, info->srcSubresource, info->srcOffset, info->extent), - .layers = MAX2(info->extent.depth, info->dstSubresource.layerCount) - }); -} - -void -tu_CmdResolveImage(VkCommandBuffer cmd_buffer_h, - VkImage src_image_h, - VkImageLayout src_image_layout, - VkImage dest_image_h, - VkImageLayout dest_image_layout, - uint32_t region_count, - const VkImageResolve *regions) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, cmd_buffer_h); - TU_FROM_HANDLE(tu_image, src_image, src_image_h); - TU_FROM_HANDLE(tu_image, dst_image, dest_image_h); - - tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); - - for (uint32_t i = 0; i < region_count; ++i) - tu_resolve_image(cmdbuf, src_image, dst_image, regions + i); -} diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c index e8d47b8f20b..4f075e173e2 100644 --- a/src/freedreno/vulkan/tu_pass.c +++ b/src/freedreno/vulkan/tu_pass.c @@ -39,7 +39,8 @@ static void update_samples(struct tu_subpass *subpass, #define GMEM_ALIGN 0x4000 static void -compute_gmem_offsets(struct tu_render_pass *pass, uint32_t gmem_size) +compute_gmem_offsets(struct tu_render_pass *pass, + const struct tu_physical_device *phys_dev) { /* calculate total bytes per pixel */ uint32_t cpp_total = 0; @@ -56,12 +57,14 @@ compute_gmem_offsets(struct tu_render_pass *pass, uint32_t gmem_size) return; } - /* TODO: this algorithm isn't optimal + /* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path + * doesn't break things. maybe there is a better solution? 
+ * TODO: this algorithm isn't optimal * for example, two attachments with cpp = {1, 4} * result: nblocks = {12, 52}, pixels = 196608 * optimal: nblocks = {13, 51}, pixels = 208896 */ - uint32_t gmem_blocks = gmem_size / GMEM_ALIGN; + uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / GMEM_ALIGN; uint32_t offset = 0, pixels = ~0u; for (uint32_t i = 0; i < pass->attachment_count; i++) { struct tu_render_pass_attachment *att = &pass->attachments[i]; @@ -206,7 +209,7 @@ tu_CreateRenderPass(VkDevice _device, *pRenderPass = tu_render_pass_to_handle(pass); - compute_gmem_offsets(pass, device->physical_device->gmem_size); + compute_gmem_offsets(pass, device->physical_device); return VK_SUCCESS; } @@ -335,7 +338,7 @@ tu_CreateRenderPass2(VkDevice _device, *pRenderPass = tu_render_pass_to_handle(pass); - compute_gmem_offsets(pass, device->physical_device->gmem_size); + compute_gmem_offsets(pass, device->physical_device); return VK_SUCCESS; } diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 50365371dad..aecc294b79f 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -77,6 +77,8 @@ typedef uint32_t xcb_window_t; #include "tu_entrypoints.h" +#include "vk_format.h" + #define MAX_VBS 32 #define MAX_VERTEX_ATTRIBS 32 #define MAX_RTS 8 @@ -1284,6 +1286,48 @@ tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back); void tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]); +void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples); + +void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2); + +void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1); + +struct tu_image_view; + +void +tu_resolve_sysmem(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_image_view *src, + struct tu_image_view *dst, + uint32_t layers, + const VkRect2D *rect); + +void +tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + const VkRenderPassBeginInfo *info); + +void +tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + const VkRenderPassBeginInfo *info); + +void +tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); + +/* expose this function to be able to emit load without checking LOAD_OP */ +void +tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); + +/* note: gmem store can also resolve */ +void +tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + uint32_t gmem_a); + struct tu_userdata_info * tu_lookup_user_sgpr(struct tu_pipeline *pipeline, gl_shader_stage stage, @@ -1330,18 +1374,6 @@ tu6_base_format(VkFormat format) return tu6_format_color(format, TILE6_LINEAR).fmt; } -void -tu_pack_clear_value(const VkClearValue *val, - VkFormat format, - uint32_t buf[4]); - -void -tu_2d_clear_color(const VkClearColorValue *val, VkFormat format, uint32_t buf[4]); - -void -tu_2d_clear_zs(const VkClearDepthStencilValue *val, VkFormat format, uint32_t buf[4]); - -enum a6xx_2d_ifmt tu6_fmt_to_ifmt(enum a6xx_format fmt); enum a6xx_depth_format tu6_pipe2depth(VkFormat format); struct tu_image @@ -1409,6 +1441,14 @@ tu_image_stride(struct tu_image *image, int level) return image->layout.slices[level].pitch * image->layout.cpp; } +/* to get the right pitch for compressed formats */ +static inline uint32_t +tu_image_pitch(struct tu_image *image, int level) +{ + uint32_t stride = 
tu_image_stride(image, level); + return stride / vk_format_get_blockwidth(image->vk_format); +} + static inline uint64_t tu_image_base(struct tu_image *image, int level, int layer) { @@ -1458,10 +1498,16 @@ tu_image_ubwc_base(struct tu_image *image, int level, int layer) #define tu_image_view_ubwc_base_ref(iview) \ tu_image_ubwc_base_ref(iview->image, iview->base_mip, iview->base_layer) +#define tu_image_view_ubwc_pitches(iview) \ + .pitch = tu_image_ubwc_pitch(iview->image, iview->base_mip), \ + .array_pitch = tu_image_ubwc_size(iview->image, iview->base_mip) >> 2 + enum a6xx_tile_mode tu6_get_image_tile_mode(struct tu_image *image, int level); enum a3xx_msaa_samples tu_msaa_samples(uint32_t samples); +enum a6xx_tex_fetchsize +tu6_fetchsize(VkFormat format); static inline struct tu_native_format tu6_format_image(struct tu_image *image, VkFormat format, uint32_t level) @@ -1705,21 +1751,6 @@ tu_gem_info_offset(const struct tu_device *dev, uint32_t gem_handle); uint64_t tu_gem_info_iova(const struct tu_device *dev, uint32_t gem_handle); - -void -tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t attachment, - const VkClearValue *value, - const VkClearRect *rect); - -void -tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t attachment, - uint8_t component_mask, - const VkClearValue *value); - #define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \ \ static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ -- 2.30.2
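
Reviewer note on the new tu_barrier() render-pass path: it services Vulkan
subpass self-dependencies. A minimal sketch of the application-side barrier
that would take this path (hypothetical usage, not part of the patch; the
command buffer and the render pass setup are assumed):

   #include <vulkan/vulkan.h>

   /* Record a subpass self-dependency barrier: cmd_buffer is assumed to be
    * recording inside a render pass whose subpass declares a dependency on
    * itself (srcSubpass == dstSubpass). On turnip this now emits the CCU
    * color/depth flushes plus a CACHE_INVALIDATE.
    */
   static void
   record_input_attachment_barrier(VkCommandBuffer cmd_buffer)
   {
      const VkMemoryBarrier barrier = {
         .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
         .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
         .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT,
      };
      vkCmdPipelineBarrier(cmd_buffer,
                           VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
                           VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
                           VK_DEPENDENCY_BY_REGION_BIT,
                           1, &barrier, 0, NULL, 0, NULL);
   }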
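
The non-optimality noted in the compute_gmem_offsets() TODO can be checked
with a few lines of arithmetic. A self-contained sketch, assuming a total of
64 GMEM blocks (the patch does not state the total; 64 is the value that
reproduces the numbers in the comment) and GMEM_ALIGN = 0x4000 as defined in
tu_pass.c:

   #include <stdint.h>
   #include <stdio.h>

   /* Tile-pixel capacity of a block split: the attachment whose blocks
    * hold the fewest pixels limits the whole tile. */
   static uint32_t
   max_tile_pixels(const uint32_t *nblocks, const uint32_t *cpp, int n)
   {
      uint32_t pixels = ~0u;
      for (int i = 0; i < n; i++) {
         uint32_t p = nblocks[i] * 0x4000 / cpp[i]; /* GMEM_ALIGN = 0x4000 */
         if (p < pixels)
            pixels = p;
      }
      return pixels;
   }

   int main(void)
   {
      const uint32_t cpp[2] = {1, 4};
      const uint32_t proportional[2] = {12, 52}; /* what the code computes */
      const uint32_t better[2] = {13, 51};       /* same 64 blocks, repacked */
      printf("%u vs %u\n",
             max_tile_pixels(proportional, cpp, 2),  /* 196608 */
             max_tile_pixels(better, cpp, 2));       /* 208896 */
      return 0;
   }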
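
The per-attachment hooks added to tu_private.h pair up at render-pass
boundaries. A hypothetical sketch of how a caller might drive them at
vkCmdBeginRenderPass time (sketch only; the real sequencing lives in
tu_clear_blit.c and tu_cmd_buffer.c, and both hooks are assumed to check the
attachment's load op internally, as the comment about LOAD_OP above
suggests):

   /* assumes tu_private.h is included */
   static void
   begin_pass_loads_and_clears(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                               const VkRenderPassBeginInfo *info)
   {
      const struct tu_render_pass *pass = cmd->state.pass;
      for (uint32_t a = 0; a < pass->attachment_count; a++) {
         /* assumed to be a no-op unless the attachment is LOAD_OP_LOAD */
         tu_load_gmem_attachment(cmd, cs, a);
         /* assumed to be a no-op unless the attachment is LOAD_OP_CLEAR */
         tu_clear_gmem_attachment(cmd, cs, a, info);
      }
   }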
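
The new tu_image_pitch() helper folds the block-width division into one
place. A worked example with hypothetical numbers, assuming the layout pitch
is stored in pixels and cpp is the byte size of one compressed block
(assumptions for illustration; the diff does not show the layout
definitions):

   #include <stdint.h>
   #include <assert.h>

   int main(void)
   {
      /* hypothetical BC1 mip level: 256 px wide, 4x4 blocks, 8 B/block */
      const uint32_t pitch_px = 256, cpp = 8, blockwidth = 4;
      const uint32_t stride = pitch_px * cpp;     /* tu_image_stride(): 2048 */
      const uint32_t pitch = stride / blockwidth; /* tu_image_pitch():   512 */
      /* 64 blocks per row * 8 bytes per block = 512 bytes per block row */
      assert(pitch == (pitch_px / blockwidth) * cpp);
      return 0;
   }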