+ unsigned dispatch_initiator =
+ S_00B800_COMPUTE_SHADER_EN(1) |
+ S_00B800_FORCE_START_AT_000(1) |
+ /* If the KMD allows it (there is a KMD hw register for it),
+ * allow launching waves out-of-order. (same as Vulkan) */
+ S_00B800_ORDER_MODE(sctx->b.chip_class >= CIK);
+
+ if (info->indirect) {
+ uint64_t base_va = r600_resource(info->indirect)->gpu_address;
+
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+ (struct r600_resource *)info->indirect,
+ RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
+
+ radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
+ PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, 1);
+ radeon_emit(cs, base_va);
+ radeon_emit(cs, base_va >> 32);
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) |
+ PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, info->indirect_offset);
+ radeon_emit(cs, dispatch_initiator);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
+ PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, info->grid[0]);
+ radeon_emit(cs, info->grid[1]);
+ radeon_emit(cs, info->grid[2]);
+ radeon_emit(cs, dispatch_initiator);
+ }
+}
+
+
+static void si_launch_grid(
+ struct pipe_context *ctx, const struct pipe_grid_info *info)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct si_compute *program = sctx->cs_shader_state.program;
+ const amd_kernel_code_t *code_object =
+ si_compute_get_code_object(program, info->pc);
+ int i;
+ /* HW bug workaround when CS threadgroups > 256 threads and async
+ * compute isn't used, i.e. only one compute job can run at a time.
+ * If async compute is possible, the threadgroup size must be limited
+ * to 256 threads on all queues to avoid the bug.
+ * Only SI and certain CIK chips are affected.
+ */
+ bool cs_regalloc_hang =
+ (sctx->b.chip_class == SI ||
+ sctx->b.family == CHIP_BONAIRE ||
+ sctx->b.family == CHIP_KABINI) &&
+ info->block[0] * info->block[1] * info->block[2] > 256;
+
+ if (cs_regalloc_hang)
+ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+ if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+ program->shader.compilation_failed)
+ return;
+
+ si_decompress_compute_textures(sctx);
+
+ /* Add buffer sizes for memory checking in need_cs_space. */
+ r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
+ /* TODO: add the scratch buffer */
+
+ if (info->indirect) {
+ r600_context_add_resource_size(ctx, info->indirect);
+
+ /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+ if (sctx->b.chip_class <= VI &&
+ r600_resource(info->indirect)->TC_L2_dirty) {
+ sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+ r600_resource(info->indirect)->TC_L2_dirty = false;
+ }
+ }
+
+ si_need_cs_space(sctx);
+
+ if (!sctx->cs_shader_state.initialized)
+ si_initialize_compute(sctx);
+
+ if (sctx->b.flags)
+ si_emit_cache_flush(sctx);
+
+ if (!si_switch_compute_shader(sctx, program, &program->shader,
+ code_object, info->pc))
+ return;
+
+ si_upload_compute_shader_descriptors(sctx);
+ si_emit_compute_shader_userdata(sctx);
+
+ if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
+ sctx->atoms.s.render_cond->emit(&sctx->b,
+ sctx->atoms.s.render_cond);
+ si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
+ }
+
+ if ((program->input_size ||
+ program->ir_type == PIPE_SHADER_IR_NATIVE) &&
+ unlikely(!si_upload_compute_input(sctx, code_object, info))) {
+ return;
+ }