From 0e7b7c3087026015b293e224f024dc4ba7aecb84 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 18 Jun 2020 18:08:58 -0400 Subject: [PATCH] turnip: vsc improvements * Remove scratch_bo from cmdbuffer, use a device-global bo instead, which also includes border color (and eventually shaders for 3D blit path) * Use CP_SET_BIN_DATA5_OFFSET to allow setting VSC buffer addresses only once at the start of the cmdstream * Use scratch bo mechanism for a resizable VSC buffer * Use feedback from "vsc_draw_overflow" and "vsc_prim_overflow" values to increase the size of VSC buffer when beginning to record a new cmdbuffer Signed-off-by: Jonathan Marek Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.c | 125 ++++++++++++++------------- src/freedreno/vulkan/tu_device.c | 46 +++------- src/freedreno/vulkan/tu_private.h | 63 +++++++------- 3 files changed, 109 insertions(+), 125 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index dac6ef82a87..4effbba572a 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -130,7 +130,7 @@ tu6_emit_event_write(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1); tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event)); if (need_seqno) { - tu_cs_emit_qw(cs, cmd->scratch_bo.iova); + tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy)); tu_cs_emit(cs, 0); } } @@ -598,12 +598,12 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); tu_cs_emit(cs, 0x0); - tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7); + tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4); tu_cs_emit(cs, fb->pipe_sizes[pipe] | CP_SET_BIN_DATA5_0_VSC_N(slot)); - tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + pipe * cmd->vsc_draw_strm_pitch); - tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + pipe * 4 + 32 * cmd->vsc_draw_strm_pitch); - tu_cs_emit_qw(cs, cmd->vsc_prim_strm.iova + pipe * cmd->vsc_prim_strm_pitch); + tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch); + tu_cs_emit(cs, pipe * 4); + tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch); tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); tu_cs_emit(cs, 0x0); @@ -714,7 +714,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) static void tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - const struct tu_physical_device *phys_dev = cmd->device->physical_device; + struct tu_device *dev = cmd->device; + const struct tu_physical_device *phys_dev = dev->physical_device; tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); @@ -827,9 +828,52 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) A6XX_RB_LRZ_CNTL(0)); tu_cs_emit_regs(cs, - A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color)); + A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo, + .bo_offset = gb_offset(border_color))); tu_cs_emit_regs(cs, - A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color)); + A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo, + .bo_offset = gb_offset(border_color))); + + /* VSC buffers: + * use vsc pitches from the largest values used so far with this device + * if there hasn't been overflow, there will already be a scratch bo + * allocated for these sizes + * + * if overflow is detected, the stream size is increased by 2x + */ + mtx_lock(&dev->vsc_pitch_mtx); + + struct tu6_global *global = dev->global_bo.map; + + uint32_t vsc_draw_overflow = global->vsc_draw_overflow; + uint32_t vsc_prim_overflow = global->vsc_prim_overflow; + + if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch) + dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD; + + if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch) + dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD; + + cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch; + cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch; + + mtx_unlock(&dev->vsc_pitch_mtx); + + struct tu_bo *vsc_bo; + uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES + + cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES; + + tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo); + + tu_cs_emit_regs(cs, + A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0)); + tu_cs_emit_regs(cs, + A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo)); + tu_cs_emit_regs(cs, + A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo, + .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES)); + + tu_bo_list_add(&cmd->bo_list, vsc_bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); tu_cs_sanity_check(cs); } @@ -841,9 +885,7 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_regs(cs, A6XX_VSC_BIN_SIZE(.width = fb->tile0.width, - .height = fb->tile0.height), - A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = &cmd->vsc_draw_strm, - .bo_offset = 32 * cmd->vsc_draw_strm_pitch)); + .height = fb->tile0.height)); tu_cs_emit_regs(cs, A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width, @@ -853,14 +895,12 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_array(cs, fb->pipe_config, 32); tu_cs_emit_regs(cs, - A6XX_VSC_PRIM_STRM_ADDRESS(.bo = &cmd->vsc_prim_strm), A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch), - A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - 64)); + A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD)); tu_cs_emit_regs(cs, - A6XX_VSC_DRAW_STRM_ADDRESS(.bo = &cmd->vsc_draw_strm), A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch), - A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - 64)); + A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD)); } static void @@ -870,32 +910,26 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) const uint32_t used_pipe_count = fb->pipe_count.width * fb->pipe_count.height; - /* Clear vsc_scratch: */ - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); - tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch)); - tu_cs_emit(cs, 0x0); - - /* Check for overflow, write vsc_scratch if detected: */ for (int i = 0; i < used_pipe_count; i++) { tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8); tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | CP_COND_WRITE5_0_WRITE_MEMORY); tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i))); tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); - tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - 64)); + tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD)); tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0)); - tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch)); - tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_draw_strm_pitch)); + tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow)); + tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch)); tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8); tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | CP_COND_WRITE5_0_WRITE_MEMORY); tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i))); tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); - tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - 64)); + tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD)); tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0)); - tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch)); - tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_prim_strm_pitch)); + tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow)); + tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch)); } tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); @@ -1241,9 +1275,6 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd) { const struct tu_framebuffer *fb = cmd->state.framebuffer; - if (use_hw_binning(cmd)) - cmd->use_vsc_data = true; - tu6_tile_render_begin(cmd, &cmd->cs); uint32_t pipe = 0; @@ -1334,28 +1365,12 @@ tu_create_cmd_buffer(struct tu_device *device, list_inithead(&cmd_buffer->upload.list); - VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000); - if (result != VK_SUCCESS) - goto fail_scratch_bo; - - /* TODO: resize on overflow */ - cmd_buffer->vsc_draw_strm_pitch = device->vsc_draw_strm_pitch; - cmd_buffer->vsc_prim_strm_pitch = device->vsc_prim_strm_pitch; - cmd_buffer->vsc_draw_strm = device->vsc_draw_strm; - cmd_buffer->vsc_prim_strm = device->vsc_prim_strm; - return VK_SUCCESS; - -fail_scratch_bo: - list_del(&cmd_buffer->pool_link); - return result; } static void tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) { - tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo); - list_del(&cmd_buffer->pool_link); tu_cs_finish(&cmd_buffer->cs); @@ -1839,7 +1854,7 @@ void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) { /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2); - tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(flush_base[i])); + tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i])); tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i); } @@ -1861,7 +1876,7 @@ void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, 0x40000 | /* ??? */ CP_MEM_TO_REG_0_UNK31 | CP_MEM_TO_REG_0_CNT(1)); - tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(flush_base[idx])); + tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx])); if (offset) { tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); @@ -1933,18 +1948,8 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer) tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs); } - tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo, - MSM_SUBMIT_BO_WRITE); - - if (cmd_buffer->use_vsc_data) { - tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_draw_strm, - MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); - tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_prim_strm, - MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); - } - - tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->device->border_color, - MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->device->global_bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) { tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i], diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 716c168d2e4..c8f7f134a2c 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1182,7 +1182,6 @@ struct PACKED bcolor_entry { }, }; - VkResult tu_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo, @@ -1265,30 +1264,20 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, if (!device->compiler) goto fail_queues; -#define VSC_DRAW_STRM_SIZE(pitch) ((pitch) * 32 + 0x100) /* extra size to store VSC_SIZE */ -#define VSC_PRIM_STRM_SIZE(pitch) ((pitch) * 32) - - device->vsc_draw_strm_pitch = 0x440 * 4; - device->vsc_prim_strm_pitch = 0x1040 * 4; - - result = tu_bo_init_new(device, &device->vsc_draw_strm, VSC_DRAW_STRM_SIZE(device->vsc_draw_strm_pitch)); - if (result != VK_SUCCESS) - goto fail_vsc_data; - - result = tu_bo_init_new(device, &device->vsc_prim_strm, VSC_PRIM_STRM_SIZE(device->vsc_prim_strm_pitch)); - if (result != VK_SUCCESS) - goto fail_vsc_data2; + /* initial sizes, these will increase if there is overflow */ + device->vsc_draw_strm_pitch = 0x1000 + VSC_PAD; + device->vsc_prim_strm_pitch = 0x4000 + VSC_PAD; - STATIC_ASSERT(sizeof(struct bcolor_entry) == 128); - result = tu_bo_init_new(device, &device->border_color, sizeof(border_color)); + STATIC_ASSERT(sizeof(border_color) == sizeof(((struct tu6_global*) 0)->border_color)); + result = tu_bo_init_new(device, &device->global_bo, sizeof(struct tu6_global)); if (result != VK_SUCCESS) - goto fail_border_color; + goto fail_global_bo; - result = tu_bo_map(device, &device->border_color); + result = tu_bo_map(device, &device->global_bo); if (result != VK_SUCCESS) - goto fail_border_color_map; + goto fail_global_bo_map; - memcpy(device->border_color.map, border_color, sizeof(border_color)); + memcpy(device->global_bo.map + gb_offset(border_color), border_color, sizeof(border_color)); VkPipelineCacheCreateInfo ci; ci.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; @@ -1307,20 +1296,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain); + mtx_init(&device->vsc_pitch_mtx, mtx_plain); + *pDevice = tu_device_to_handle(device); return VK_SUCCESS; fail_pipeline_cache: -fail_border_color_map: - tu_bo_finish(device, &device->border_color); - -fail_border_color: - tu_bo_finish(device, &device->vsc_prim_strm); +fail_global_bo_map: + tu_bo_finish(device, &device->global_bo); -fail_vsc_data2: - tu_bo_finish(device, &device->vsc_draw_strm); - -fail_vsc_data: +fail_global_bo: ralloc_free(device->compiler); fail_queues: @@ -1343,9 +1328,6 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) if (!device) return; - tu_bo_finish(device, &device->vsc_draw_strm); - tu_bo_finish(device, &device->vsc_prim_strm); - for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) { for (unsigned q = 0; q < device->queue_count[i]; q++) tu_queue_finish(&device->queues[i][q]); diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 12e5b0739a6..3287edf8a22 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -339,6 +339,31 @@ struct tu_bo void *map; }; +/* This struct defines the layout of the global_bo */ +struct tu6_global +{ + /* 6 bcolor_entry entries, one for each VK_BORDER_COLOR */ + uint8_t border_color[128 * 6]; + + uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */ + uint32_t _pad0; + volatile uint32_t vsc_draw_overflow; + uint32_t _pad1; + volatile uint32_t vsc_prim_overflow; + uint32_t _pad2[3]; + + /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ + struct { + uint32_t offset; + uint32_t pad[7]; + } flush_base[4]; +}; +#define gb_offset(member) offsetof(struct tu6_global, member) +#define global_iova(cmd, member) ((cmd)->device->global_bo.iova + gb_offset(member)) + +/* extra space in vsc draw/prim streams */ +#define VSC_PAD 0x40 + struct tu_device { VK_LOADER_DATA _loader_data; @@ -358,11 +383,6 @@ struct tu_device /* Backup in-memory cache to be used if the app doesn't provide one */ struct tu_pipeline_cache *mem_cache; - struct tu_bo vsc_draw_strm; - struct tu_bo vsc_prim_strm; - uint32_t vsc_draw_strm_pitch; - uint32_t vsc_prim_strm_pitch; - #define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */ /* Currently the kernel driver uses a 32-bit GPU address space, but it @@ -374,9 +394,13 @@ struct tu_device bool initialized; } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2]; - struct tu_bo border_color; + struct tu_bo global_bo; struct tu_device_extension_table enabled_extensions; + + uint32_t vsc_draw_strm_pitch; + uint32_t vsc_prim_strm_pitch; + mtx_t vsc_pitch_mtx; }; VkResult _tu_device_set_lost(struct tu_device *device, @@ -883,28 +907,6 @@ tu_bo_list_add(struct tu_bo_list *list, VkResult tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other); -/* This struct defines the layout of the scratch_bo */ -struct tu6_control -{ - uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */ - uint32_t _pad0; - volatile uint32_t vsc_overflow; - uint32_t _pad1; - /* flag set from cmdstream when VSC overflow detected: */ - uint32_t vsc_scratch; - uint32_t _pad2; - uint32_t _pad3; - uint32_t _pad4; - - /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ - struct { - uint32_t offset; - uint32_t pad[7]; - } flush_base[4]; -}; - -#define ctrl_offset(member) offsetof(struct tu6_control, member) - struct tu_cmd_buffer { VK_LOADER_DATA _loader_data; @@ -939,15 +941,10 @@ struct tu_cmd_buffer struct tu_cs draw_epilogue_cs; struct tu_cs sub_cs; - struct tu_bo scratch_bo; - bool has_tess; - struct tu_bo vsc_draw_strm; - struct tu_bo vsc_prim_strm; uint32_t vsc_draw_strm_pitch; uint32_t vsc_prim_strm_pitch; - bool use_vsc_data; }; /* Temporary struct for tracking a register state to be written, used by -- 2.30.2