From c3efeac4c68e158722478772f73394aa6292d9a4 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Tue, 26 Nov 2019 20:37:19 -0800
Subject: [PATCH] turnip: Add support for compute shaders.

Since compute shares the FS state with graphics, we have to re-upload
the pipeline state when switching between compute dispatch and graphics
draws.  We could potentially expose graphics and compute as separate
queues and then we wouldn't need pipeline state management, but the
closed driver exposes a single queue and consistency with it is
probably good.

So far I'm emitting texture/ibo state as IBs that we jump to.  This is
kind of silly when we could just emit it directly in our CS, but that's
a refactor we can do later.

Reviewed-by: Jonathan Marek
---
 src/freedreno/vulkan/tu_cmd_buffer.c | 168 ++++++++++++++++++++++++-
 src/freedreno/vulkan/tu_pipeline.c   | 181 ++++++++++++++++++++++++---
 src/freedreno/vulkan/tu_private.h    |  11 +-
 3 files changed, 332 insertions(+), 28 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index a32495103f1..5c6eeb2c50f 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -1809,7 +1809,8 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
       cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
       break;
    case VK_PIPELINE_BIND_POINT_COMPUTE:
-      tu_finishme("binding compute pipeline");
+      cmd->state.compute_pipeline = pipeline;
+      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
       break;
    default:
       unreachable("unrecognized pipeline bind point");
@@ -2557,13 +2558,17 @@ tu6_emit_ibo(struct tu_device *device, struct tu_cs *draw_state,
    /* emit texture state: */
    tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6, 3);
    tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
-              CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
+              CP_LOAD_STATE6_0_STATE_TYPE(type == MESA_SHADER_COMPUTE ?
+                                          ST6_IBO : ST6_SHADER) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-              CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
+              CP_LOAD_STATE6_0_STATE_BLOCK(type == MESA_SHADER_COMPUTE ?
+                                           SB6_CS_SHADER : SB6_IBO) |
               CP_LOAD_STATE6_0_NUM_UNIT(link->image_mapping.num_ibo));
    tu_cs_emit_qw(&cs, ibo_addr); /* SRC_ADDR_LO/HI */
 
-   tu_cs_emit_pkt4(&cs, REG_A6XX_SP_IBO_LO, 2);
+   tu_cs_emit_pkt4(&cs,
+                   type == MESA_SHADER_COMPUTE ?
+                   REG_A6XX_SP_CS_IBO_LO : REG_A6XX_SP_IBO_LO, 2);
    tu_cs_emit_qw(&cs, ibo_addr); /* SRC_ADDR_LO/HI */
 
    return tu_cs_end_sub_stream(draw_state, &cs);
@@ -2806,7 +2811,11 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
            }
       }
    }
-   cmd->state.dirty = 0;
+
+   /* Fragment shader state overwrites compute shader state, so flag the
+    * compute pipeline for re-emit.
+    */
+   cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
 }
 
 static void
@@ -2989,9 +2998,156 @@ struct tu_dispatch_info
 };
 
 static void
-tu_dispatch(struct tu_cmd_buffer *cmd_buffer,
+tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
+                              const struct tu_dispatch_info *info)
+{
+   gl_shader_stage type = MESA_SHADER_COMPUTE;
+   const struct tu_program_descriptor_linkage *link =
+      &pipeline->program.link[type];
+   const struct ir3_const_state *const_state = &link->const_state;
+   uint32_t offset_dwords = const_state->offsets.driver_param;
+
+   if (link->constlen <= offset_dwords)
+      return;
+
+   if (!info->indirect) {
+      uint32_t driver_params[] = {
+         info->blocks[0],
+         info->blocks[1],
+         info->blocks[2],
+         pipeline->compute.local_size[0],
+         pipeline->compute.local_size[1],
+         pipeline->compute.local_size[2],
+      };
+      uint32_t num_consts = MIN2(const_state->num_driver_params,
+                                 link->constlen - offset_dwords);
+      uint32_t align_size = align(num_consts, 4);
+
+      /* push constants */
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + align_size);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset_dwords / 4) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(align_size / 4));
+      tu_cs_emit(cs, 0);
+      tu_cs_emit(cs, 0);
+      uint32_t i;
+      for (i = 0; i < num_consts; i++)
+         tu_cs_emit(cs, driver_params[i]);
+      for (; i < align_size; i++)
+         tu_cs_emit(cs, 0);
+   } else {
+      tu_finishme("Indirect driver params");
+   }
+}
+
+static void
+tu_dispatch(struct tu_cmd_buffer *cmd,
            const struct tu_dispatch_info *info)
 {
+   struct tu_cs *cs = &cmd->cs;
+   struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
+   struct tu_descriptor_state *descriptors_state =
+      &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
+
+   VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
+   if (result != VK_SUCCESS) {
+      cmd->record_result = result;
+      return;
+   }
+
+   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
+      tu_cs_emit_ib(cs, &pipeline->program.state_ib);
+
+   struct tu_cs_entry ib;
+
+   ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
+   if (ib.size)
+      tu_cs_emit_ib(cs, &ib);
+
+   tu_emit_compute_driver_params(cs, pipeline, info);
+
+   bool needs_border;
+   ib = tu6_emit_textures(cmd->device, &cmd->draw_state, pipeline,
+                          descriptors_state, MESA_SHADER_COMPUTE,
+                          &needs_border);
+   if (ib.size)
+      tu_cs_emit_ib(cs, &ib);
+
+   if (needs_border)
+      tu6_emit_border_color(cmd, cs);
+
+   ib = tu6_emit_ibo(cmd->device, &cmd->draw_state, pipeline,
+                     descriptors_state, MESA_SHADER_COMPUTE);
+   if (ib.size)
+      tu_cs_emit_ib(cs, &ib);
+
+   /* track BOs */
+   if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
+      unsigned i;
+      for_each_bit(i, descriptors_state->valid) {
+         struct tu_descriptor_set *set = descriptors_state->sets[i];
+         for (unsigned j = 0; j < set->layout->buffer_count; ++j)
+            if (set->descriptors[j]) {
+               tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
+                              MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
+            }
+      }
+   }
+
+   /* Compute shader state overwrites fragment shader state, so we flag the
+    * graphics pipeline for re-emit.
+    */
+   cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
+
+   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x8));
+
+   const uint32_t *local_size = pipeline->compute.local_size;
+   const uint32_t *num_groups = info->blocks;
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_NDRANGE_0, 7);
+   tu_cs_emit(cs,
+              A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(3) |
+              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |
+              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |
+              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));
+   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0]));
+   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */
+   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1]));
+   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */
+   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2]));
+   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3);
+   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_X */
+   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_Y */
+   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_Z */
+
+   if (info->indirect) {
+      uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
+
+      tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
+                     MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
+
+      tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
+      tu_cs_emit(cs, 0x00000000);
+      tu_cs_emit_qw(cs, iova);
+      tu_cs_emit(cs,
+                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
+                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
+                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
+   } else {
+      tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
+      tu_cs_emit(cs, 0x00000000);
+      tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
+      tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
+      tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
+   }
+
+   tu_cs_emit_wfi(cs);
+
+   tu6_emit_cache_flush(cmd, cs);
 }
 
 void
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index 0e35d4e60dc..976bf60f81f 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -476,6 +476,52 @@ tu6_emit_fs_config(struct tu_cs *cs, const struct ir3_shader_variant *fs)
    tu_cs_emit(cs, fs->image_mapping.num_ibo);
 }
 
+static void
+tu6_emit_cs_config(struct tu_cs *cs, const struct ir3_shader_variant *v)
+{
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
+   tu_cs_emit(cs, 0xff);
+
+   unsigned constlen = align(v->constlen, 4);
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL, 1);
+   tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_CONSTLEN(constlen) |
+              A6XX_HLSQ_CS_CNTL_ENABLED);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CONFIG, 2);
+   tu_cs_emit(cs, A6XX_SP_CS_CONFIG_ENABLED |
+              A6XX_SP_CS_CONFIG_NIBO(v->image_mapping.num_ibo) |
+              A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |
+              A6XX_SP_CS_CONFIG_NSAMP(v->num_samp) |
+              A6XX_SP_CS_CONFIG_NIBO(v->image_mapping.num_ibo));
+   tu_cs_emit(cs, v->instrlen);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CTRL_REG0, 1);
+   tu_cs_emit(cs, A6XX_SP_CS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
+              A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(v->info.max_reg + 1) |
+              A6XX_SP_CS_CTRL_REG0_MERGEDREGS |
+              A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) |
+              COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE));
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
+   tu_cs_emit(cs, 0x41);
+
+   uint32_t local_invocation_id =
+      ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
+   uint32_t work_group_id =
+      ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
+   tu_cs_emit(cs,
+              A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
+              A6XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
+              A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
+              A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
+   tu_cs_emit(cs, 0x2fc);            /* HLSQ_CS_UNKNOWN_B998 */
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_IBO_COUNT, 1);
+   tu_cs_emit(cs, v->image_mapping.num_ibo);
+}
+
 static void
 tu6_emit_vs_system_values(struct tu_cs *cs,
                           const struct ir3_shader_variant *vs)
@@ -1441,13 +1487,12 @@ tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4])
 }
 
 static VkResult
-tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder,
-                                    struct tu_pipeline **out_pipeline)
+tu_pipeline_create(struct tu_device *dev,
+                   const VkAllocationCallbacks *pAllocator,
+                   struct tu_pipeline **out_pipeline)
 {
-   struct tu_device *dev = builder->device;
-
    struct tu_pipeline *pipeline =
-      vk_zalloc2(&dev->alloc, builder->alloc, sizeof(*pipeline), 8,
+      vk_zalloc2(&dev->alloc, pAllocator, sizeof(*pipeline), 8,
                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!pipeline)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -1457,7 +1502,7 @@ tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder,
    /* reserve the space now such that tu_cs_begin_sub_stream never fails */
    VkResult result = tu_cs_reserve_space(dev, &pipeline->cs, 2048);
    if (result != VK_SUCCESS) {
-      vk_free2(&dev->alloc, builder->alloc, pipeline);
+      vk_free2(&dev->alloc, pAllocator, pipeline);
       return result;
    }
 
@@ -1813,7 +1858,8 @@ static VkResult
 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
                           struct tu_pipeline **pipeline)
 {
-   VkResult result = tu_pipeline_builder_create_pipeline(builder, pipeline);
+   VkResult result = tu_pipeline_create(builder->device, builder->alloc,
+                                        pipeline);
    if (result != VK_SUCCESS)
       return result;
 
@@ -1949,38 +1995,133 @@ tu_CreateGraphicsPipelines(VkDevice device,
    return final_result;
 }
 
+static void
+tu6_emit_compute_program(struct tu_cs *cs,
+                         struct tu_shader *shader,
+                         const struct tu_bo *binary_bo)
+{
+   const struct ir3_shader_variant *v = &shader->variants[0];
+
+   tu6_emit_cs_config(cs, v);
+
+   /* The compute program is the only one in the pipeline, so 0 offset.
+    */
+   tu6_emit_shader_object(cs, MESA_SHADER_COMPUTE, v, binary_bo, 0);
+
+   tu6_emit_immediates(cs, v, CP_LOAD_STATE6_FRAG, SB6_CS_SHADER);
+}
+
 static VkResult
-tu_compute_pipeline_create(VkDevice _device,
+tu_compute_upload_shader(VkDevice device,
+                         struct tu_pipeline *pipeline,
+                         struct tu_shader *shader)
+{
+   TU_FROM_HANDLE(tu_device, dev, device);
+   struct tu_bo *bo = &pipeline->program.binary_bo;
+   struct ir3_shader_variant *v = &shader->variants[0];
+
+   uint32_t shader_size = sizeof(uint32_t) * v->info.sizedwords;
+   VkResult result =
+      tu_bo_init_new(dev, bo, shader_size);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = tu_bo_map(dev, bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   memcpy(bo->map, shader->binary, shader_size);
+
+   return VK_SUCCESS;
+}
+
+
+static VkResult
+tu_compute_pipeline_create(VkDevice device,
                            VkPipelineCache _cache,
                            const VkComputePipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipeline)
 {
+   TU_FROM_HANDLE(tu_device, dev, device);
+   const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
+   VkResult result;
+
+   struct tu_pipeline *pipeline;
+
+   result = tu_pipeline_create(dev, pAllocator, &pipeline);
+   if (result != VK_SUCCESS)
+      return result;
+
+   struct tu_shader_compile_options options;
+   tu_shader_compile_options_init(&options, NULL);
+
+   struct tu_shader *shader =
+      tu_shader_create(dev, MESA_SHADER_COMPUTE, stage_info, pAllocator);
+   if (!shader) {
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      goto fail;
+   }
+
+   result = tu_shader_compile(dev, shader, NULL, &options, pAllocator);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   struct tu_program_descriptor_linkage *link = &pipeline->program.link[MESA_SHADER_COMPUTE];
+   struct ir3_shader_variant *v = &shader->variants[0];
+
+   link->ubo_state = v->shader->ubo_state;
+   link->const_state = v->shader->const_state;
+   link->constlen = v->constlen;
+   link->texture_map = shader->texture_map;
+   link->sampler_map = shader->sampler_map;
+   link->ubo_map = shader->ubo_map;
+   link->ssbo_map = shader->ssbo_map;
+   link->image_mapping = v->image_mapping;
+
+   result = tu_compute_upload_shader(device, pipeline, shader);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   for (int i = 0; i < 3; i++)
+      pipeline->compute.local_size[i] = v->shader->nir->info.cs.local_size[i];
+
+   struct tu_cs prog_cs;
+   tu_cs_begin_sub_stream(dev, &pipeline->cs, 512, &prog_cs);
+   tu6_emit_compute_program(&prog_cs, shader, &pipeline->program.binary_bo);
+   pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);
+
+   *pPipeline = tu_pipeline_to_handle(pipeline);
    return VK_SUCCESS;
+
+fail:
+   tu_shader_destroy(dev, shader, pAllocator);
+   if (result != VK_SUCCESS) {
+      tu_pipeline_finish(pipeline, dev, pAllocator);
+      vk_free2(&dev->alloc, pAllocator, pipeline);
+   }
+
+   return result;
 }
 
 VkResult
-tu_CreateComputePipelines(VkDevice _device,
+tu_CreateComputePipelines(VkDevice device,
                           VkPipelineCache pipelineCache,
                           uint32_t count,
                           const VkComputePipelineCreateInfo *pCreateInfos,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipelines)
 {
-   VkResult result = VK_SUCCESS;
+   VkResult final_result = VK_SUCCESS;
 
-   unsigned i = 0;
-   for (; i < count; i++) {
-      VkResult r;
-      r = tu_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i],
-                                     pAllocator, &pPipelines[i]);
-      if (r != VK_SUCCESS) {
-         result = r;
-      }
-      pPipelines[i] = VK_NULL_HANDLE;
+   for (uint32_t i = 0; i < count; i++) {
+      VkResult result = tu_compute_pipeline_create(device, pipelineCache,
+                                                   &pCreateInfos[i],
+                                                   pAllocator, &pPipelines[i]);
+      if (result != VK_SUCCESS)
+         final_result = result;
    }
 
-   return result;
+   return final_result;
 }
 
 void
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 813ff0cc1ea..2532feed14e 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -825,8 +825,9 @@ struct tu_tiling_config
 enum tu_cmd_dirty_bits
 {
    TU_CMD_DIRTY_PIPELINE = 1 << 0,
-   TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1,
-   TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 2,
+   TU_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1,
+   TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 2,
+   TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3,
 
    TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16,
    TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17,
@@ -839,6 +840,7 @@ struct tu_cmd_state
    uint32_t dirty;
 
    struct tu_pipeline *pipeline;
+   struct tu_pipeline *compute_pipeline;
 
    /* Vertex buffers */
    struct
@@ -1167,6 +1169,11 @@ struct tu_pipeline
    {
      struct tu_cs_entry state_ib;
    } blend;
+
+   struct
+   {
+      uint32_t local_size[3];
+   } compute;
 };
 
 void
-- 
2.30.2
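
For reference, a minimal API-side sketch (not part of the patch) of the path this
series enables. It assumes an already-created VkDevice, VkPipelineLayout, and
compute-stage VkShaderModule, and omits error handling; the helper names below
are illustrative, only the Vulkan entry points are real.

#include <vulkan/vulkan.h>

/* Creating a compute pipeline reaches tu_CreateComputePipelines() ->
 * tu_compute_pipeline_create() above.
 */
static VkPipeline
create_compute_pipeline(VkDevice device, VkPipelineLayout layout,
                        VkShaderModule module)
{
   const VkComputePipelineCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
         .stage = VK_SHADER_STAGE_COMPUTE_BIT,
         .module = module,
         .pName = "main",
      },
      .layout = layout,
   };
   VkPipeline pipeline = VK_NULL_HANDLE;

   vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &info, NULL, &pipeline);
   return pipeline;
}

/* Recording a dispatch: vkCmdBindPipeline() sets cmd->state.compute_pipeline
 * and TU_CMD_DIRTY_COMPUTE_PIPELINE; vkCmdDispatch() lands in tu_dispatch(),
 * which re-emits the compute program whenever an intervening graphics draw
 * has clobbered the shared FS/CS state, per the commit message.
 */
static void
record_dispatch(VkCommandBuffer cmd, VkPipeline pipeline)
{
   vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
   vkCmdDispatch(cmd, 8, 8, 1);
}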