From e7b2719f69a77b9daad5c17d651b54d98720669f Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 10 Nov 2017 12:53:13 -0500 Subject: [PATCH] freedreno/a5xx: indirect grids Signed-off-by: Rob Clark --- .../drivers/freedreno/a5xx/fd5_compute.c | 39 ++++++++---- .../drivers/freedreno/ir3/ir3_shader.c | 62 ++++++++++++++++--- .../drivers/freedreno/ir3/ir3_shader.h | 5 ++ 3 files changed, 86 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c index 2efcece41a5..362ab1dc544 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c @@ -26,6 +26,8 @@ #include "pipe/p_state.h" +#include "freedreno_resource.h" + #include "fd5_compute.h" #include "fd5_context.h" #include "fd5_emit.h" @@ -110,9 +112,9 @@ cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v) OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL_0, 2); OUT_RING(ring, A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | - A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) | - A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) | - A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) | + A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) | + A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); OUT_RING(ring, 0x1); /* HLSQ_CS_CNTL_1 */ fd5_emit_shader(ring, v); @@ -126,9 +128,6 @@ fd5_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) struct ir3_shader_variant *v; struct fd_ringbuffer *ring = ctx->batch->draw; - if (info->indirect) - return; // TODO - v = ir3_shader_variant(so->shader, key, &ctx->debug); if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG) @@ -158,11 +157,29 @@ fd5_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */ OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */ - OUT_PKT7(ring, CP_EXEC_CS, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0])); - OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1])); - OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2])); + if (info->indirect) { + struct fd_resource *rsc = fd_resource(info->indirect); + + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, CACHE_FLUSH_TS); + OUT_RELOCW(ring, fd5_context(ctx)->blit_mem, 0, 0, 0); /* ADDR_LO/HI */ + OUT_RING(ring, 0x00000000); + + OUT_WFI5(ring); + + OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */ + OUT_RING(ring, CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | + CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | + CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); + } else { + OUT_PKT7(ring, CP_EXEC_CS, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0])); + OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1])); + OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2])); + } } void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 61a336ed7dd..3337543113d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -859,15 +859,59 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin /* emit compute-shader driver-params: */ uint32_t offset = v->constbase.driver_param; if (v->constlen > offset) { - uint32_t compute_params[IR3_DP_CS_COUNT] = { - [IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0], - [IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1], - [IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2], - /* do we need work-group-size? */ - }; - fd_wfi(ctx->batch, ring); - ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0, - ARRAY_SIZE(compute_params), compute_params, NULL); + + if (info->indirect) { + struct pipe_resource *indirect = NULL; + unsigned indirect_offset; + + /* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs + * to be aligned more strongly than 4 bytes. So in this case + * we need a temporary buffer to copy NumWorkGroups.xyz to. + * + * TODO if previous compute job is writing to info->indirect, + * we might need a WFI.. but since we currently flush for each + * compute job, we are probably ok for now. + */ + if (info->indirect_offset & 0xf) { + indirect = pipe_buffer_create(&ctx->screen->base, + PIPE_BIND_COMMAND_ARGS_BUFFER, PIPE_USAGE_STREAM, + 0x1000); + indirect_offset = 0; + + if (is_a5xx(ctx->screen)) { + struct fd_bo *src = fd_resource(info->indirect)->bo; + struct fd_bo *dst = fd_resource(indirect)->bo; + for (unsigned i = 0; i < 3; i++) { + unsigned dst_off = i * 4; + unsigned src_off = (i * 4) + info->indirect_offset; + OUT_PKT7(ring, CP_MEM_TO_MEM, 5); + OUT_RING(ring, 0x00000000); + OUT_RELOCW(ring, dst, dst_off, 0, 0); + OUT_RELOC (ring, src, src_off, 0, 0); + } + } else { + assert(0); + } + } else { + pipe_resource_reference(&indirect, info->indirect); + indirect_offset = info->indirect_offset; + } + + ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, + indirect_offset, 4, NULL, indirect); + + pipe_resource_reference(&indirect, NULL); + } else { + uint32_t compute_params[IR3_DP_CS_COUNT] = { + [IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0], + [IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1], + [IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2], + /* do we need work-group-size? */ + }; + + ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0, + ARRAY_SIZE(compute_params), compute_params, NULL); + } } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 3886cce5571..040ea4cd9d8 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -44,6 +44,11 @@ enum ir3_driver_param { IR3_DP_NUM_WORK_GROUPS_X = 0, IR3_DP_NUM_WORK_GROUPS_Y = 1, IR3_DP_NUM_WORK_GROUPS_Z = 2, + /* NOTE: gl_NumWorkGroups should be vec4 aligned because + * glDispatchComputeIndirect() needs to load these from + * the info->indirect buffer. Keep that in mind when/if + * adding any addition CS driver params. + */ IR3_DP_CS_COUNT = 4, /* must be aligned to vec4 */ /* vertex shader driver params: */ -- 2.30.2