+ radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+ radeon_emit(cs, config->rsrc1);
+ radeon_emit(cs, config->rsrc2);
+
+ COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x "
+ "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2);
+
+ radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+ S_00B860_WAVES(sctx->scratch_waves)
+ | S_00B860_WAVESIZE(config->scratch_bytes_per_wave >> 10));
+
+ sctx->cs_shader_state.emitted_program = program;
+ sctx->cs_shader_state.offset = offset;
+ sctx->cs_shader_state.uses_scratch =
+ config->scratch_bytes_per_wave != 0;
+
+ return true;
+}
+
+static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
+ const amd_kernel_code_t *code_object,
+ unsigned user_sgpr)
+{
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+ unsigned max_private_element_size = AMD_HSA_BITS_GET(
+ code_object->code_properties,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
+
+ uint32_t scratch_dword0 = scratch_va & 0xffffffff;
+ uint32_t scratch_dword1 =
+ S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+ S_008F04_SWIZZLE_ENABLE(1);
+
+ /* Disable address clamping */
+ uint32_t scratch_dword2 = 0xffffffff;
+ uint32_t scratch_dword3 =
+ S_008F0C_INDEX_STRIDE(3) |
+ S_008F0C_ADD_TID_ENABLE(1);
+
+ if (sctx->b.chip_class >= GFX9) {
+ assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */
+ } else {
+ scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size);
+
+ if (sctx->b.chip_class < VI) {
+ /* BUF_DATA_FORMAT is ignored, but it cannot be
+ * BUF_DATA_FORMAT_INVALID. */
+ scratch_dword3 |=
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
+ }
+ }
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ (user_sgpr * 4), 4);
+ radeon_emit(cs, scratch_dword0);
+ radeon_emit(cs, scratch_dword1);
+ radeon_emit(cs, scratch_dword2);
+ radeon_emit(cs, scratch_dword3);
+}
+
+static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
+ const amd_kernel_code_t *code_object,
+ const struct pipe_grid_info *info,
+ uint64_t kernel_args_va)
+{
+ struct si_compute *program = sctx->cs_shader_state.program;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+
+ static const enum amd_code_property_mask_t workgroup_count_masks [] = {
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
+ };
+
+ unsigned i, user_sgpr = 0;
+ if (AMD_HSA_BITS_GET(code_object->code_properties,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+ if (code_object->workitem_private_segment_byte_size > 0) {
+ setup_scratch_rsrc_user_sgprs(sctx, code_object,
+ user_sgpr);
+ }
+ user_sgpr += 4;
+ }
+
+ if (AMD_HSA_BITS_GET(code_object->code_properties,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
+ struct dispatch_packet dispatch;
+ unsigned dispatch_offset;
+ struct r600_resource *dispatch_buf = NULL;
+ uint64_t dispatch_va;
+
+ /* Upload dispatch ptr */
+ memset(&dispatch, 0, sizeof(dispatch));
+
+ dispatch.workgroup_size_x = info->block[0];
+ dispatch.workgroup_size_y = info->block[1];
+ dispatch.workgroup_size_z = info->block[2];
+
+ dispatch.grid_size_x = info->grid[0] * info->block[0];
+ dispatch.grid_size_y = info->grid[1] * info->block[1];
+ dispatch.grid_size_z = info->grid[2] * info->block[2];
+
+ dispatch.private_segment_size = program->private_size;
+ dispatch.group_segment_size = program->local_size;
+
+ dispatch.kernarg_address = kernel_args_va;
+
+ u_upload_data(sctx->b.b.const_uploader, 0, sizeof(dispatch),
+ 256, &dispatch, &dispatch_offset,
+ (struct pipe_resource**)&dispatch_buf);
+
+ if (!dispatch_buf) {
+ fprintf(stderr, "Error: Failed to allocate dispatch "
+ "packet.");
+ }
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, dispatch_buf,
+ RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+
+ dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ (user_sgpr * 4), 2);
+ radeon_emit(cs, dispatch_va);
+ radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
+ S_008F04_STRIDE(0));
+
+ r600_resource_reference(&dispatch_buf, NULL);
+ user_sgpr += 2;
+ }
+
+ if (AMD_HSA_BITS_GET(code_object->code_properties,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ (user_sgpr * 4), 2);
+ radeon_emit(cs, kernel_args_va);
+ radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
+ S_008F04_STRIDE(0));
+ user_sgpr += 2;
+ }
+
+ for (i = 0; i < 3 && user_sgpr < 16; i++) {
+ if (code_object->code_properties & workgroup_count_masks[i]) {
+ radeon_set_sh_reg_seq(cs,
+ R_00B900_COMPUTE_USER_DATA_0 +
+ (user_sgpr * 4), 1);
+ radeon_emit(cs, info->grid[i]);
+ user_sgpr += 1;
+ }
+ }
+}
+
+static bool si_upload_compute_input(struct si_context *sctx,
+ const amd_kernel_code_t *code_object,
+ const struct pipe_grid_info *info)
+{
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ struct si_compute *program = sctx->cs_shader_state.program;
+ struct r600_resource *input_buffer = NULL;
+ unsigned kernel_args_size;
+ unsigned num_work_size_bytes = program->use_code_object_v2 ? 0 : 36;
+ uint32_t kernel_args_offset = 0;
+ uint32_t *kernel_args;
+ void *kernel_args_ptr;
+ uint64_t kernel_args_va;
+ unsigned i;