+radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
+			   const struct radv_dispatch_info *info)
+{
+	/* Writes the packets for one compute dispatch into cmd_buffer->cs.
+	 *
+	 * cmd_buffer: supplies the bound compute pipeline, the winsys and
+	 *             the command stream that is written to.
+	 * info:       a non-NULL info->indirect selects the indirect path,
+	 *             which hands the indirect buffer's address to the
+	 *             dispatch packet; otherwise info->blocks[] is emitted
+	 *             directly, and info->unaligned additionally programs
+	 *             the NUM_THREAD registers and sets PARTIAL_TG_EN.
+	 *
+	 * When the shader has a user SGPR slot for AC_UD_CS_GRID_SIZE
+	 * (sgpr_idx != -1), the grid size is also written to that slot.
+	 */
+	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+	struct radv_shader_variant *cs_variant = pipeline->shaders[MESA_SHADER_COMPUTE];
+	struct radeon_winsys *ws = cmd_buffer->device->ws;
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	const uint8_t num_grid_comps = cs_variant->info.info.cs.grid_components_used;
+	struct ac_userdata_info *grid_loc =
+		radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
+				      AC_UD_CS_GRID_SIZE);
+
+	/* The largest sequence emitted below is the non-MEC indirect path:
+	 * 6 dwords per COPY_DATA plus 7 dwords for SET_BASE and
+	 * DISPATCH_INDIRECT, i.e. 25 dwords for three grid components.
+	 * NOTE(review): presumably radeon_check_space() guarantees that
+	 * room and returns the limit checked by the closing assert - confirm.
+	 */
+	MAYBE_UNUSED unsigned cdw_limit = radeon_check_space(ws, cs, 25);
+
+	if (info->indirect) {
+		const uint64_t bo_va = radv_buffer_get_va(info->indirect->bo);
+		const uint64_t args_va = bo_va +
+			(info->indirect->offset + info->indirect_offset);
+
+		ws->cs_add_buffer(cs, info->indirect->bo, 8);
+
+		if (grid_loc->sgpr_idx != -1) {
+			/* One COPY_DATA (memory source, register destination)
+			 * per used grid component: consecutive dwords at
+			 * args_va go to consecutive user-data registers.
+			 */
+			for (unsigned comp = 0; comp < num_grid_comps; comp++) {
+				const uint64_t src_va = args_va + 4 * comp;
+
+				radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+				radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+						COPY_DATA_DST_SEL(COPY_DATA_REG));
+				radeon_emit(cs, src_va);
+				radeon_emit(cs, src_va >> 32);
+				radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
+						  + grid_loc->sgpr_idx * 4) >> 2) + comp);
+				radeon_emit(cs, 0);
+			}
+		}
+
+		if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
+			/* Non-MEC: the address is supplied through SET_BASE and
+			 * DISPATCH_INDIRECT follows with a 0 dword (presumably
+			 * the offset from that base - confirm).
+			 */
+			radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
+					PKT3_SHADER_TYPE_S(1));
+			radeon_emit(cs, 1);
+			radeon_emit(cs, args_va);
+			radeon_emit(cs, args_va >> 32);
+
+			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
+					PKT3_SHADER_TYPE_S(1));
+			radeon_emit(cs, 0);
+			radeon_emit(cs, 1);
+		} else {
+			/* MEC: DISPATCH_INDIRECT carries the address itself. */
+			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
+					PKT3_SHADER_TYPE_S(1));
+			radeon_emit(cs, args_va);
+			radeon_emit(cs, args_va >> 32);
+			radeon_emit(cs, 1);
+		}
+	} else {
+		unsigned grid[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
+		unsigned initiator = S_00B800_COMPUTE_SHADER_EN(1);
+
+		if (info->unaligned) {
+			unsigned *wg_size = cs_variant->info.cs.block_size;
+			unsigned partial[3];
+
+			/* For a dimension that is already aligned, this must
+			 * come out as a whole block size rather than 0.
+			 */
+			for (unsigned d = 0; d < 3; d++) {
+				partial[d] = grid[d] + wg_size[d] -
+					     align_u32_npot(grid[d], wg_size[d]);
+			}
+
+			/* NOTE(review): presumably turns each dimension into a
+			 * count of block-size units - confirm against
+			 * round_up_u32().
+			 */
+			for (unsigned d = 0; d < 3; d++)
+				grid[d] = round_up_u32(grid[d], wg_size[d]);
+
+			/* Three consecutive registers starting at
+			 * COMPUTE_NUM_THREAD_X, one per dimension.
+			 */
+			radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+			for (unsigned d = 0; d < 3; d++) {
+				radeon_emit(cs,
+					    S_00B81C_NUM_THREAD_FULL(wg_size[d]) |
+					    S_00B81C_NUM_THREAD_PARTIAL(partial[d]));
+			}
+
+			initiator |= S_00B800_PARTIAL_TG_EN(1);
+		}
+
+		if (grid_loc->sgpr_idx != -1) {
+			assert(!grid_loc->indirect);
+			assert(grid_loc->num_sgprs == num_grid_comps);
+
+			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+						  grid_loc->sgpr_idx * 4, num_grid_comps);
+			/* The first component is always written; the second and
+			 * third only when the shader uses that many.
+			 */
+			radeon_emit(cs, grid[0]);
+			for (unsigned d = 1; d < 3 && d < num_grid_comps; d++)
+				radeon_emit(cs, grid[d]);
+		}
+
+		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
+				PKT3_SHADER_TYPE_S(1));
+		for (unsigned d = 0; d < 3; d++)
+			radeon_emit(cs, grid[d]);
+		radeon_emit(cs, initiator);
+	}
+
+	assert(cmd_buffer->cs->cdw <= cdw_limit);
+}
+
+static void
+radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
+ const struct radv_dispatch_info *info)