#include "pipe/p_state.h"
+#include "freedreno_resource.h"
+
#include "fd5_compute.h"
#include "fd5_context.h"
#include "fd5_emit.h"
OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL_0, 2);
OUT_RING(ring, A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
- A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
- A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
- A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
+ A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
+ A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
+ A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
OUT_RING(ring, 0x1); /* HLSQ_CS_CNTL_1 */
fd5_emit_shader(ring, v);
struct ir3_shader_variant *v;
struct fd_ringbuffer *ring = ctx->batch->draw;
- if (info->indirect)
- return; // TODO
-
v = ir3_shader_variant(so->shader, key, &ctx->debug);
if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG)
OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */
OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */
- OUT_PKT7(ring, CP_EXEC_CS, 4);
- OUT_RING(ring, 0x00000000);
- OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0]));
- OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1]));
- OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2]));
+ if (info->indirect) {
+ struct fd_resource *rsc = fd_resource(info->indirect);
+
+ OUT_PKT7(ring, CP_EVENT_WRITE, 4);
+ OUT_RING(ring, CACHE_FLUSH_TS);
+ OUT_RELOCW(ring, fd5_context(ctx)->blit_mem, 0, 0, 0); /* ADDR_LO/HI */
+ OUT_RING(ring, 0x00000000);
+
+ OUT_WFI5(ring);
+
+ OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4);
+ OUT_RING(ring, 0x00000000);
+ OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */
+ OUT_RING(ring, CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
+ CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
+ CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
+ } else {
+ OUT_PKT7(ring, CP_EXEC_CS, 4);
+ OUT_RING(ring, 0x00000000);
+ OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0]));
+ OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1]));
+ OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2]));
+ }
}
void
/* emit compute-shader driver-params: */
uint32_t offset = v->constbase.driver_param;
if (v->constlen > offset) {
- uint32_t compute_params[IR3_DP_CS_COUNT] = {
- [IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
- [IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
- [IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
- /* do we need work-group-size? */
- };
-
fd_wfi(ctx->batch, ring);
- ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0,
- ARRAY_SIZE(compute_params), compute_params, NULL);
+
+ if (info->indirect) {
+ struct pipe_resource *indirect = NULL;
+ unsigned indirect_offset;
+
+ /* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs
+ * to be aligned more strongly than 4 bytes. So in this case
+ * we need a temporary buffer to copy NumWorkGroups.xyz to.
+ *
+ * TODO if previous compute job is writing to info->indirect,
+ * we might need a WFI.. but since we currently flush for each
+ * compute job, we are probably ok for now.
+ */
+ if (info->indirect_offset & 0xf) {
+ indirect = pipe_buffer_create(&ctx->screen->base,
+ PIPE_BIND_COMMAND_ARGS_BUFFER, PIPE_USAGE_STREAM,
+ 0x1000);
+ indirect_offset = 0;
+
+ if (is_a5xx(ctx->screen)) {
+ struct fd_bo *src = fd_resource(info->indirect)->bo;
+ struct fd_bo *dst = fd_resource(indirect)->bo;
+ for (unsigned i = 0; i < 3; i++) {
+ unsigned dst_off = i * 4;
+ unsigned src_off = (i * 4) + info->indirect_offset;
+ OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
+ OUT_RING(ring, 0x00000000);
+ OUT_RELOCW(ring, dst, dst_off, 0, 0);
+ OUT_RELOC (ring, src, src_off, 0, 0);
+ }
+ } else {
+ assert(0);
+ }
+ } else {
+ pipe_resource_reference(&indirect, info->indirect);
+ indirect_offset = info->indirect_offset;
+ }
+
+ ctx->emit_const(ring, SHADER_COMPUTE, offset * 4,
+ indirect_offset, 4, NULL, indirect);
+
+ pipe_resource_reference(&indirect, NULL);
+ } else {
+ uint32_t compute_params[IR3_DP_CS_COUNT] = {
+ [IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
+ [IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
+ [IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
+ /* do we need work-group-size? */
+ };
+
+ ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0,
+ ARRAY_SIZE(compute_params), compute_params, NULL);
+ }
}
}
IR3_DP_NUM_WORK_GROUPS_X = 0,
IR3_DP_NUM_WORK_GROUPS_Y = 1,
IR3_DP_NUM_WORK_GROUPS_Z = 2,
+ /* NOTE: gl_NumWorkGroups should be vec4 aligned because
+ * glDispatchComputeIndirect() needs to load these from
+ * the info->indirect buffer. Keep that in mind when/if
+ * adding any additional CS driver params.
+ */
IR3_DP_CS_COUNT = 4, /* must be aligned to vec4 */
/* vertex shader driver params: */