+ return true;
+}
+
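+/* Write the grid size (and, for variable group sizes, the block size) into
+ * the user SGPRs that follow the resource descriptor SGPRs, so a TGSI
+ * compute shader can read them at run time. */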
+static void si_setup_tgsi_grid(struct si_context *sctx,
+ const struct pipe_grid_info *info)
+{
+ struct si_compute *program = sctx->cs_shader_state.program;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
+ 4 * SI_NUM_RESOURCE_SGPRS;
+ unsigned block_size_reg = grid_size_reg +
+ /* 12 bytes = 3 dwords. */
+ 12 * program->uses_grid_size;
+
+ if (info->indirect) {
+ if (program->uses_grid_size) {
+ uint64_t base_va = r600_resource(info->indirect)->gpu_address;
+ uint64_t va = base_va + info->indirect_offset;
+ int i;
+
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+ (struct r600_resource *)info->indirect,
+ RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
+
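+ /* The grid size is only known on the GPU for an indirect dispatch, so
+ * let the CP copy the three dwords from the indirect buffer into the
+ * grid-size user SGPRs. */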
+ for (i = 0; i < 3; ++i) {
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+ COPY_DATA_DST_SEL(COPY_DATA_REG));
+ radeon_emit(cs, (va + 4 * i));
+ radeon_emit(cs, (va + 4 * i) >> 32);
+ radeon_emit(cs, (grid_size_reg >> 2) + i);
+ radeon_emit(cs, 0);
+ }
+ }
+ } else {
+ if (program->uses_grid_size) {
+ radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+ radeon_emit(cs, info->grid[0]);
+ radeon_emit(cs, info->grid[1]);
+ radeon_emit(cs, info->grid[2]);
+ }
+ if (program->variable_group_size && program->uses_block_size) {
+ radeon_set_sh_reg_seq(cs, block_size_reg, 3);
+ radeon_emit(cs, info->block[0]);
+ radeon_emit(cs, info->block[1]);
+ radeon_emit(cs, info->block[2]);
+ }
+ }
+}
+
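+/* Program the threadgroup dimensions and resource limits, then emit the
+ * dispatch packet: DISPATCH_DIRECT, or SET_BASE + DISPATCH_INDIRECT for
+ * indirect dispatches. */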
+static void si_emit_dispatch_packets(struct si_context *sctx,
+ const struct pipe_grid_info *info)
+{
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
+ unsigned waves_per_threadgroup =
+ DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);
+
+ radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+ S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0));
+
+ radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+ radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
+ radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
+ radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
+
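+ /* COMPUTE_DISPATCH_INITIATOR: enable the compute shader and force
+ * threadgroups to start at (0, 0, 0). */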
+ unsigned dispatch_initiator =
+ S_00B800_COMPUTE_SHADER_EN(1) |
+ S_00B800_FORCE_START_AT_000(1);
+
+ if (info->indirect) {
+ uint64_t base_va = r600_resource(info->indirect)->gpu_address;
+
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+ (struct r600_resource *)info->indirect,
+ RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
+
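+ /* SET_BASE programs the indirect buffer's base address; the dispatch
+ * packet below only supplies the offset into it. */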
+ radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
+ PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, 1);
+ radeon_emit(cs, base_va);
+ radeon_emit(cs, base_va >> 32);
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) |
+ PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, info->indirect_offset);
+ radeon_emit(cs, dispatch_initiator);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
+ PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, info->grid[0]);
+ radeon_emit(cs, info->grid[1]);
+ radeon_emit(cs, info->grid[2]);
+ radeon_emit(cs, dispatch_initiator);
+ }
+}
+
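+/* Entry point for pipe_context::launch_grid: flush state, switch to the
+ * compute shader and emit the dispatch. */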
+static void si_launch_grid(
+ struct pipe_context *ctx, const struct pipe_grid_info *info)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct si_compute *program = sctx->cs_shader_state.program;
+ const amd_kernel_code_t *code_object =
+ si_compute_get_code_object(program, info->pc);
+ int i;
+ /* HW bug workaround for when CS threadgroups use more than 256 threads
+ * and async compute isn't used, i.e. only one compute job can run at a
+ * time. If async compute is possible, the threadgroup size must be
+ * limited to 256 threads on all queues to avoid the bug.
+ * Only SI and certain CIK chips are affected.
+ */
+ bool cs_regalloc_hang =
+ (sctx->b.chip_class == SI ||
+ sctx->b.family == CHIP_BONAIRE ||
+ sctx->b.family == CHIP_KABINI) &&
+ info->block[0] * info->block[1] * info->block[2] > 256;
+
+ if (cs_regalloc_hang)
+ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+ if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+ program->shader.compilation_failed)
+ return;
+
+ si_decompress_compute_textures(sctx);
+
+ /* Add buffer sizes for memory checking in need_cs_space. */
+ r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
+ /* TODO: add the scratch buffer */
+
+ if (info->indirect) {
+ r600_context_add_resource_size(ctx, info->indirect);
+
+ /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+ if (sctx->b.chip_class <= VI &&
+ r600_resource(info->indirect)->TC_L2_dirty) {
+ sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+ r600_resource(info->indirect)->TC_L2_dirty = false;
+ }
+ }
+
+ si_need_cs_space(sctx);
+
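+ /* Emit the one-time compute setup state if it hasn't been emitted yet. */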
+ if (!sctx->cs_shader_state.initialized)
+ si_initialize_compute(sctx);
+
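+ /* Emit any pending cache flushes before the dispatch. */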
+ if (sctx->b.flags)
+ si_emit_cache_flush(sctx);
+
+ if (!si_switch_compute_shader(sctx, program, &program->shader,
+ code_object, info->pc))
+ return;
+
+ si_upload_compute_shader_descriptors(sctx);
+ si_emit_compute_shader_userdata(sctx);
+
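+ /* Emit the render condition atom if it's dirty, so predication is set up
+ * before the dispatch packets. */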
+ if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
+ sctx->atoms.s.render_cond->emit(&sctx->b,
+ sctx->atoms.s.render_cond);
+ si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
+ }
+
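+ /* Upload the compute input data (kernel arguments); skip the dispatch if
+ * the upload fails. */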
+ if ((program->input_size ||
+ program->ir_type == PIPE_SHADER_IR_NATIVE) &&
+ unlikely(!si_upload_compute_input(sctx, code_object, info))) {
+ return;
+ }