1 #include "util/u_memory.h"
3 #include "radeonsi_pipe.h"
4 #include "radeonsi_shader.h"
6 #include "radeon_llvm_util.h"
8 struct si_pipe_compute
{
9 struct r600_context
*ctx
;
12 unsigned private_size
;
14 struct si_pipe_shader shader
;
15 unsigned num_user_sgprs
;
17 struct si_pm4_state
*pm4_buffers
;
21 static void *radeonsi_create_compute_state(
22 struct pipe_context
*ctx
,
23 const struct pipe_compute_state
*cso
)
25 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
26 struct si_pipe_compute
*program
=
27 CALLOC_STRUCT(si_pipe_compute
);
28 const struct pipe_llvm_program_header
*header
;
29 const unsigned char *code
;
33 code
= cso
->prog
+ sizeof(struct pipe_llvm_program_header
);
36 program
->local_size
= cso
->req_local_mem
;
37 program
->private_size
= cso
->req_private_mem
;
38 program
->input_size
= cso
->req_input_mem
;
40 mod
= radeon_llvm_parse_bitcode(code
, header
->num_bytes
);
41 si_compile_llvm(rctx
, &program
->shader
, mod
);
46 static void radeonsi_bind_compute_state(struct pipe_context
*ctx
, void *state
)
48 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
49 rctx
->cs_shader_state
.program
= (struct si_pipe_compute
*)state
;
52 static void radeonsi_set_global_binding(
53 struct pipe_context
*ctx
, unsigned first
, unsigned n
,
54 struct pipe_resource
**resources
,
58 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
59 struct si_pipe_compute
*program
= rctx
->cs_shader_state
.program
;
60 struct si_pm4_state
*pm4
;
62 if (!program
->pm4_buffers
) {
63 program
->pm4_buffers
= CALLOC_STRUCT(si_pm4_state
);
65 pm4
= program
->pm4_buffers
;
66 pm4
->compute_pkt
= true;
72 for (i
= first
; i
< first
+ n
; i
++) {
73 uint64_t va
= r600_resource_va(ctx
->screen
, resources
[i
]);
74 si_pm4_add_bo(pm4
, (struct si_resource
*)resources
[i
],
75 RADEON_USAGE_READWRITE
);
76 memcpy(handles
[i
], &va
, sizeof(va
));
/*
 * radeonsi_launch_grid - build and submit the PM4 command stream for a
 * single compute dispatch.
 *
 * block_layout: threads per thread group in x/y/z.
 * grid_layout:  thread groups per grid in x/y/z.
 * pc:           kernel entry offset (never referenced in the visible code).
 * input:        flat kernel-argument buffer, copied dword-by-dword into
 *               the COMPUTE_USER_DATA_* user SGPRs.
 *
 * NOTE(review): this block appears truncated by whatever produced this
 * file: the opening brace, the declarations of `i` and `shader_va`,
 * continuations of a few register writes, and several closing braces are
 * missing, and stray line numbers are embedded in the text.  Code is left
 * byte-identical; only comments were added and unterminated comments
 * closed.
 */
80 static void radeonsi_launch_grid(
81 struct pipe_context
*ctx
,
82 const uint
*block_layout
, const uint
*grid_layout
,
83 uint32_t pc
, const void *input
)
/* Fetch the bound program and allocate a fresh PM4 buffer for this dispatch. */
85 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
86 struct si_pipe_compute
*program
= rctx
->cs_shader_state
.program
;
87 struct si_pm4_state
*pm4
= CALLOC_STRUCT(si_pm4_state
);
89 unsigned arg_user_sgpr_count
;
/* Mark the stream as compute and emit CONTEXT_CONTROL. */
92 pm4
->compute_pkt
= true;
93 si_cmd_context_control(pm4
);
/* Pre-dispatch cache flush + invalidate, then a surface sync. */
95 si_pm4_cmd_begin(pm4
, PKT3_EVENT_WRITE
);
96 si_pm4_cmd_add(pm4
, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH
) |
99 si_pm4_cmd_end(pm4
, false);
101 si_pm4_inval_texture_cache(pm4
);
102 si_pm4_inval_shader_cache(pm4
);
103 si_cmd_surface_sync(pm4
, pm4
->cp_coher_cntl
);
/* User SGPR count = kernel-argument bytes rounded up to whole dwords. */
105 arg_user_sgpr_count
= program
->input_size
/ 4;
106 if (program
->input_size
% 4 != 0) {
107 arg_user_sgpr_count
++;
110 /* XXX: We should store arguments in memory if we run out of user sgprs.
 */
112 assert(arg_user_sgpr_count
< 16);
/* Load each argument dword into COMPUTE_USER_DATA_0 + i. */
114 for (i
= 0; i
< arg_user_sgpr_count
; i
++) {
115 uint32_t *args
= (uint32_t*)input
;
116 si_pm4_set_reg(pm4
, R_00B900_COMPUTE_USER_DATA_0
+
/* Dispatch origin is fixed at (0,0,0). */
121 si_pm4_set_reg(pm4
, R_00B810_COMPUTE_START_X
, 0);
122 si_pm4_set_reg(pm4
, R_00B814_COMPUTE_START_Y
, 0);
123 si_pm4_set_reg(pm4
, R_00B818_COMPUTE_START_Z
, 0);
/* Thread-group dimensions come straight from block_layout. */
125 si_pm4_set_reg(pm4
, R_00B81C_COMPUTE_NUM_THREAD_X
,
126 S_00B81C_NUM_THREAD_FULL(block_layout
[0]));
127 si_pm4_set_reg(pm4
, R_00B820_COMPUTE_NUM_THREAD_Y
,
128 S_00B820_NUM_THREAD_FULL(block_layout
[1]));
129 si_pm4_set_reg(pm4
, R_00B824_COMPUTE_NUM_THREAD_Z
,
130 S_00B824_NUM_THREAD_FULL(block_layout
[2]));
132 /* XXX: This should be:
133 * (number of compute units) * 4 * (waves per simd) - 1 */
134 si_pm4_set_reg(pm4
, R_00B82C_COMPUTE_MAX_WAVE_ID
, 0x190 /* Default value */);
/* Program the shader's GPU address (256-byte aligned: low bits >> 8). */
136 shader_va
= r600_resource_va(ctx
->screen
, (void *)program
->shader
.bo
);
137 si_pm4_add_bo(pm4
, program
->shader
.bo
, RADEON_USAGE_READ
);
138 si_pm4_set_reg(pm4
, R_00B830_COMPUTE_PGM_LO
, (shader_va
>> 8) & 0xffffffff);
139 si_pm4_set_reg(pm4
, R_00B834_COMPUTE_PGM_HI
, shader_va
>> 40);
/* RSRC1: VGPR/SGPR allocation, in units of 4 VGPRs / 8 SGPRs. */
141 si_pm4_set_reg(pm4
, R_00B848_COMPUTE_PGM_RSRC1
,
142 /* We always use at least 3 VGPRS, these come from
144 * XXX: The compiler should account for this.
 */
146 S_00B848_VGPRS((MAX2(3, program
->shader
.num_vgprs
) - 1) / 4)
147 /* We always use at least 4 + arg_user_sgpr_count. The 4 extra
148 * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN
149 * XXX: The compiler should account for this.
 */
151 | S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count
,
152 program
->shader
.num_sgprs
)) - 1) / 8))
/* RSRC2: enable TGID x/y/z + TG size SGPRs, all thread-ID components. */
155 si_pm4_set_reg(pm4
, R_00B84C_COMPUTE_PGM_RSRC2
,
156 S_00B84C_SCRATCH_EN(0)
157 | S_00B84C_USER_SGPR(arg_user_sgpr_count
)
158 | S_00B84C_TGID_X_EN(1)
159 | S_00B84C_TGID_Y_EN(1)
160 | S_00B84C_TGID_Z_EN(1)
161 | S_00B84C_TG_SIZE_EN(1)
162 | S_00B84C_TIDIG_COMP_CNT(2)
163 | S_00B84C_LDS_SIZE(0)
164 | S_00B84C_EXCP_EN(0))
166 si_pm4_set_reg(pm4
, R_00B854_COMPUTE_RESOURCE_LIMITS
, 0);
/* Enable all CUs on both shader arrays of both shader engines. */
168 si_pm4_set_reg(pm4
, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0
,
169 S_00B858_SH0_CU_EN(0xffff /* Default value */)
170 | S_00B858_SH1_CU_EN(0xffff /* Default value */))
173 si_pm4_set_reg(pm4
, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1
,
174 S_00B85C_SH0_CU_EN(0xffff /* Default value */)
175 | S_00B85C_SH1_CU_EN(0xffff /* Default value */))
/* The dispatch itself: grid dimensions + initiator. */
178 si_pm4_cmd_begin(pm4
, PKT3_DISPATCH_DIRECT
);
179 si_pm4_cmd_add(pm4
, grid_layout
[0]); /* Thread groups DIM_X */
180 si_pm4_cmd_add(pm4
, grid_layout
[1]); /* Thread groups DIM_Y */
181 si_pm4_cmd_add(pm4
, grid_layout
[2]); /* Thread gropus DIM_Z */
182 si_pm4_cmd_add(pm4
, 1); /* DISPATCH_INITIATOR */
183 si_pm4_cmd_end(pm4
, false);
/* Post-dispatch: wait for the CS to finish, then flush caches again. */
185 si_pm4_cmd_begin(pm4
, PKT3_EVENT_WRITE
);
186 si_pm4_cmd_add(pm4
, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH
| EVENT_INDEX(0x4)));
187 si_pm4_cmd_end(pm4
, false);
189 si_pm4_inval_texture_cache(pm4
);
190 si_pm4_inval_shader_cache(pm4
);
191 si_cmd_surface_sync(pm4
, pm4
->cp_coher_cntl
);
/* Emit the global-buffer bindings first, then this dispatch's packets. */
193 si_pm4_emit(rctx
, program
->pm4_buffers
);
194 si_pm4_emit(rctx
, pm4
);
/* Debug: dump the raw command-stream dwords to stderr. */
197 fprintf(stderr
, "cdw: %i\n", rctx
->cs
->cdw
);
198 for (i
= 0; i
< rctx
->cs
->cdw
; i
++) {
199 fprintf(stderr
, "%4i : 0x%08X\n", i
, rctx
->cs
->buf
[i
]);
/* Submit the CS and block until the shader BO is idle (synchronous). */
203 rctx
->ws
->cs_flush(rctx
->cs
, RADEON_FLUSH_COMPUTE
, 0);
204 rctx
->ws
->buffer_wait(program
->shader
.bo
->buf
, 0);
/*
 * Compute-state destructor: currently an empty stub.
 *
 * NOTE(review): the si_pipe_compute allocated in
 * radeonsi_create_compute_state is never freed here — looks like a leak;
 * confirm whether this is an intentional placeholder.
 */
static void si_delete_compute_state(struct pipe_context *ctx, void *state)
{
}
/* Empty stub: compute resource (surface) binding does nothing yet. */
static void si_set_compute_resources(struct pipe_context *ctx_,
				     unsigned start, unsigned count,
				     struct pipe_surface **surfaces)
{
}
/* Empty stub: compute sampler-view binding does nothing yet. */
static void si_set_cs_sampler_view(struct pipe_context *ctx_,
				   unsigned start_slot, unsigned count,
				   struct pipe_sampler_view **views)
{
}
/*
 * Empty stub: compute sampler-state binding does nothing yet.
 *
 * NOTE(review): one parameter line was missing from the original between
 * `ctx_` and `num_samplers`; restored as `unsigned start_slot` to match
 * the sibling si_set_cs_sampler_view and the pipe_context interface —
 * confirm against p_context.h.
 */
static void si_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
}
223 void si_init_compute_functions(struct r600_context
*rctx
)
225 rctx
->context
.create_compute_state
= radeonsi_create_compute_state
;
226 rctx
->context
.delete_compute_state
= si_delete_compute_state
;
227 rctx
->context
.bind_compute_state
= radeonsi_bind_compute_state
;
228 /* ctx->context.create_sampler_view = evergreen_compute_create_sampler_view; */
229 rctx
->context
.set_compute_resources
= si_set_compute_resources
;
230 rctx
->context
.set_compute_sampler_views
= si_set_cs_sampler_view
;
231 rctx
->context
.bind_compute_sampler_states
= si_bind_compute_sampler_states
;
232 rctx
->context
.set_global_binding
= radeonsi_set_global_binding
;
233 rctx
->context
.launch_grid
= radeonsi_launch_grid
;