+ sctx->cs_shader_state.emitted_program = NULL;
+ sctx->cs_shader_state.initialized = true;
+}
+
+static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
+ struct si_shader *shader,
+ struct si_shader_config *config)
+{
+ uint64_t scratch_bo_size, scratch_needed;
+ scratch_bo_size = 0;
+ scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
+ if (sctx->compute_scratch_buffer)
+ scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
+
+ if (scratch_bo_size < scratch_needed) {
+ r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
+
+ sctx->compute_scratch_buffer = (struct r600_resource*)
+ r600_aligned_buffer_create(&sctx->screen->b.b,
+ R600_RESOURCE_FLAG_UNMAPPABLE,
+ PIPE_USAGE_DEFAULT,
+ scratch_needed, 256);
+
+ if (!sctx->compute_scratch_buffer)
+ return false;
+ }
+
+ if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
+ uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+ si_shader_apply_scratch_relocs(sctx, shader, config, scratch_va);
+
+ if (si_shader_binary_upload(sctx->screen, shader))
+ return false;
+
+ r600_resource_reference(&shader->scratch_bo,
+ sctx->compute_scratch_buffer);
+ }
+
+ return true;
+}
+
+static bool si_switch_compute_shader(struct si_context *sctx,
+ struct si_compute *program,
+ struct si_shader *shader,
+ const amd_kernel_code_t *code_object,
+ unsigned offset)
+{
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ struct si_shader_config inline_config = {0};
+ struct si_shader_config *config;
+ uint64_t shader_va;
+
+ if (sctx->cs_shader_state.emitted_program == program &&
+ sctx->cs_shader_state.offset == offset)
+ return true;
+
+ if (program->ir_type == PIPE_SHADER_IR_TGSI) {
+ config = &shader->config;
+ } else {
+ unsigned lds_blocks;
+
+ config = &inline_config;
+ if (code_object) {
+ code_object_to_config(code_object, config);
+ } else {
+ si_shader_binary_read_config(&shader->binary, config, offset);
+ }
+
+ lds_blocks = config->lds_size;
+ /* XXX: We are over allocating LDS. For SI, the shader reports
+ * LDS in blocks of 256 bytes, so if there are 4 bytes lds
+ * allocated in the shader and 4 bytes allocated by the state
+ * tracker, then we will set LDS_SIZE to 512 bytes rather than 256.
+ */
+ if (sctx->b.chip_class <= SI) {
+ lds_blocks += align(program->local_size, 256) >> 8;
+ } else {
+ lds_blocks += align(program->local_size, 512) >> 9;
+ }
+
+ /* TODO: use si_multiwave_lds_size_workaround */
+ assert(lds_blocks <= 0xFF);
+
+ config->rsrc2 &= C_00B84C_LDS_SIZE;
+ config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
+ }
+
+ if (!si_setup_compute_scratch_buffer(sctx, shader, config))
+ return false;
+
+ if (shader->scratch_bo) {
+ COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
+ "Total Scratch: %u bytes\n", sctx->scratch_waves,
+ config->scratch_bytes_per_wave,
+ config->scratch_bytes_per_wave *
+ sctx->scratch_waves);
+
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+ shader->scratch_bo, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SCRATCH_BUFFER);
+ }
+
+ /* Prefetch the compute shader to TC L2.
+ *
+ * We should also prefetch graphics shaders if a compute dispatch was
+ * the last command, and the compute shader if a draw call was the last
+ * command. However, that would add more complexity and we're likely
+ * to get a shader state change in that case anyway.
+ */
+ if (sctx->b.chip_class >= CIK) {
+ cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b,
+ 0, program->shader.bo->b.b.width0);
+ }
+
+ shader_va = shader->bo->gpu_address + offset;
+ if (program->use_code_object_v2) {
+ /* Shader code is placed after the amd_kernel_code_t
+ * struct. */
+ shader_va += sizeof(amd_kernel_code_t);
+ }
+
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
+ RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+ radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
+ radeon_emit(cs, shader_va >> 8);
+ radeon_emit(cs, shader_va >> 40);
+
+ radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+ radeon_emit(cs, config->rsrc1);
+ radeon_emit(cs, config->rsrc2);
+
+ COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x "
+ "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2);
+
+ radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+ S_00B860_WAVES(sctx->scratch_waves)
+ | S_00B860_WAVESIZE(config->scratch_bytes_per_wave >> 10));
+
+ sctx->cs_shader_state.emitted_program = program;
+ sctx->cs_shader_state.offset = offset;
+ sctx->cs_shader_state.uses_scratch =
+ config->scratch_bytes_per_wave != 0;
+
+ return true;
+}
+
+static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
+ const amd_kernel_code_t *code_object,
+ unsigned user_sgpr)
+{
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+ unsigned max_private_element_size = AMD_HSA_BITS_GET(
+ code_object->code_properties,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
+
+ uint32_t scratch_dword0 = scratch_va & 0xffffffff;
+ uint32_t scratch_dword1 =
+ S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+ S_008F04_SWIZZLE_ENABLE(1);
+
+ /* Disable address clamping */
+ uint32_t scratch_dword2 = 0xffffffff;
+ uint32_t scratch_dword3 =
+ S_008F0C_ELEMENT_SIZE(max_private_element_size) |
+ S_008F0C_INDEX_STRIDE(3) |
+ S_008F0C_ADD_TID_ENABLE(1);
+
+
+ if (sctx->screen->b.chip_class < VI) {
+ /* BUF_DATA_FORMAT is ignored, but it cannot be
+ BUF_DATA_FORMAT_INVALID. */
+ scratch_dword3 |=
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
+ }
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ (user_sgpr * 4), 4);
+ radeon_emit(cs, scratch_dword0);
+ radeon_emit(cs, scratch_dword1);
+ radeon_emit(cs, scratch_dword2);
+ radeon_emit(cs, scratch_dword3);
+}
+
+static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
+ const amd_kernel_code_t *code_object,
+ const struct pipe_grid_info *info,
+ uint64_t kernel_args_va)
+{
+ struct si_compute *program = sctx->cs_shader_state.program;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+
+ static const enum amd_code_property_mask_t workgroup_count_masks [] = {
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
+ };
+
+ unsigned i, user_sgpr = 0;
+ if (AMD_HSA_BITS_GET(code_object->code_properties,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+ if (code_object->workitem_private_segment_byte_size > 0) {
+ setup_scratch_rsrc_user_sgprs(sctx, code_object,
+ user_sgpr);
+ }
+ user_sgpr += 4;
+ }
+
+ if (AMD_HSA_BITS_GET(code_object->code_properties,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
+ struct dispatch_packet dispatch;
+ unsigned dispatch_offset;
+ struct r600_resource *dispatch_buf = NULL;
+ uint64_t dispatch_va;
+
+ /* Upload dispatch ptr */
+ memset(&dispatch, 0, sizeof(dispatch));
+
+ dispatch.workgroup_size_x = info->block[0];
+ dispatch.workgroup_size_y = info->block[1];
+ dispatch.workgroup_size_z = info->block[2];
+
+ dispatch.grid_size_x = info->grid[0] * info->block[0];
+ dispatch.grid_size_y = info->grid[1] * info->block[1];
+ dispatch.grid_size_z = info->grid[2] * info->block[2];
+
+ dispatch.private_segment_size = program->private_size;
+ dispatch.group_segment_size = program->local_size;
+
+ dispatch.kernarg_address = kernel_args_va;
+
+ u_upload_data(sctx->b.b.const_uploader, 0, sizeof(dispatch),
+ 256, &dispatch, &dispatch_offset,
+ (struct pipe_resource**)&dispatch_buf);
+
+ if (!dispatch_buf) {
+ fprintf(stderr, "Error: Failed to allocate dispatch "
+ "packet.");
+ }
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, dispatch_buf,
+ RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+
+ dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ (user_sgpr * 4), 2);
+ radeon_emit(cs, dispatch_va);
+ radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
+ S_008F04_STRIDE(0));
+
+ r600_resource_reference(&dispatch_buf, NULL);
+ user_sgpr += 2;
+ }
+
+ if (AMD_HSA_BITS_GET(code_object->code_properties,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ (user_sgpr * 4), 2);
+ radeon_emit(cs, kernel_args_va);
+ radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
+ S_008F04_STRIDE(0));
+ user_sgpr += 2;
+ }
+
+ for (i = 0; i < 3 && user_sgpr < 16; i++) {
+ if (code_object->code_properties & workgroup_count_masks[i]) {
+ radeon_set_sh_reg_seq(cs,
+ R_00B900_COMPUTE_USER_DATA_0 +
+ (user_sgpr * 4), 1);
+ radeon_emit(cs, info->grid[i]);
+ user_sgpr += 1;
+ }
+ }