/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "llvm_wrapper.h"
#endif
/*
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 is for binding a smaller input parameter buffer, and for constant indexing,
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byteaddressing enabled,
we should reserve another one too => 10 image bindings for writing at most.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first < 10, so their id corresponds to RAT(id+1)
writable images will consume TEX slots, VTX slots too because of linear indexing
*/
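
/* A worked illustration of the scheme above (my reading of it, not driver
 * code): a kernel with two writable images and one read-only image would be
 * bound roughly as
 *
 *   kernel parameters  -> CONST0 + VTX0
 *   global buffers     -> RAT0 (write), VTX1 (read)
 *   image0 (writable)  -> RAT1, read back through TEX2
 *   image1 (writable)  -> RAT2, read back through TEX3
 *   image2 (read-only) -> TEX4
 */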
static void evergreen_cs_set_vertex_buffer(
	struct r600_context * rctx,
	unsigned vb_index,
	unsigned offset,
	struct pipe_resource * buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	state->atom.dirty = true;
}
static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle,             /* get_handle */
	r600_compute_global_buffer_destroy,        /* resource_destroy */
	r600_compute_global_transfer_map,          /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap,        /* transfer_unmap */
	r600_compute_global_transfer_inline_write  /* transfer_inline_write */
};
void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;
	unsigned i;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	shader->ctx = (struct r600_context *)ctx;
	shader->resources = (struct evergreen_compute_resource *)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->num_kernels = llvm_get_num_kernels(code, header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);

	for (i = 0; i < shader->num_kernels; i++) {
		struct r600_kernel *kernel = &shader->kernels[i];
		kernel->llvm_module = llvm_get_kernel_module(i, code,
							header->num_bytes);
	}
#endif
	return shader;
}
void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}
static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");

	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}
/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
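
/* For illustration only: the layout above written out as a C struct. This is
 * a hypothetical sketch to make the offsets concrete; the driver builds the
 * buffer with raw pointer arithmetic below and never defines such a type. */
struct hypothetical_compute_input {
	uint32_t num_work_groups[3]; /* DWORDS 0-2 */
	uint32_t global_size[3];     /* DWORDS 3-5 */
	uint32_t local_size[3];      /* DWORDS 6-8 */
	uint32_t params[];           /* DWORDS 9+, i.e. kernel_parameters_offset_bytes = 9 * 4 = 36 */
};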
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	int i;
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		unsigned buffer_size = shader->input_size;

		/* Add space for the grid dimensions */
		buffer_size += kernel_parameters_offset_bytes;
		shader->kernel_param = r600_compute_buffer_alloc_vram(
						ctx->screen, buffer_size);
	}

	num_work_groups_start = r600_buffer_mmap_sync_with_rings(ctx, shader->kernel_param, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups (the grid layout) */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
			(shader->input_size / 4); i++) {
		COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_cs_set_vertex_buffer(ctx, 0, 0,
			(struct pipe_resource *)shader->kernel_param);
	///ID=0 is reserved for parameters
	evergreen_set_const_cache(shader, 0, shader->kernel_param,
						shader->input_size, 0);
}
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	/* XXX: Enable lds and get size from cs_shader_state */
	unsigned lds_size = 0;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}
	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;
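
	/* Worked example (illustrative numbers, not read from the hardware):
	 * a 16x16x1 thread block on a part with num_pipes = 2 gives
	 * ceil(256 / 32) = 8 wavefronts per thread group. */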
	COMPUTE_DBG(rctx->screen, "Using %u pipes, there are %u wavefronts per thread block\n",
							num_pipes, num_waves);

	/* XXX: Partition the LDS between PS/CS. By default half (4096 dwords
	 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
	 * We may need to allocate the entire LDS space for Compute Shaders.
	 *
	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
	 * CM: CM_R_0286FC_SPI_LDS_MGMT := S_0286FC_NUM_LS_LDS(lds_dwords)
	 */

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	r600_write_value(cs, grid_layout[0]);
	r600_write_value(cs, grid_layout[1]);
	r600_write_value(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	r600_write_value(cs, 1);
}
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
	unsigned flush_flags = 0;
	int i;
	struct r600_resource *onebo = NULL;
	struct evergreen_compute_resource *resources =
			ctx->cs_shader_state.shader->resources;

	/* make sure that the gfx ring is the only one active */
	if (ctx->rings.dma.cs) {
		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface *)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
						       (struct r600_resource *)cb->base.texture,
						       RADEON_USAGE_READWRITE);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		r600_write_value(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		r600_write_value(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		r600_write_value(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		r600_write_value(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		r600_write_value(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			r600_write_value(cs, reloc);
		}

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, reloc);
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (resources[i].enabled) {
			int j;
			COMPUTE_DBG(ctx->screen, "resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < resources[i].cs_end; j++) {
				if (resources[i].do_reloc[j]) {
					assert(resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
							resources[i].bo,
							resources[i].usage);
				}

				cs->buf[cs->cdw++] = resources[i].cs[j];
			}

			if (resources[i].bo) {
				onebo = resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);

				///special case for textures
				if (resources[i].do_reloc
					[resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
							resources[i].bo,
							resources[i].usage);
				}
			}
		}
	}

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	r600_flush_emit(ctx);

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

	flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
	if (ctx->keep_tiling_flags) {
		flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
	}

	ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags);

	ctx->flags = 0;

	COMPUTE_DBG(ctx->screen, "shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG(ctx->screen, "...\n");
}
/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
				(struct r600_cs_shader_state *)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
	uint64_t va;

	va = r600_resource_va(&rctx->screen->screen, &kernel->code_bo->b.b);

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
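	/* The SQ_PGM_START_* registers take the shader's GPU address in units
	 * of 256 bytes, hence the >> 8 below (my reading of the register,
	 * stated here as an assumption). */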
	r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	r600_write_value(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(kernel->bc.ngpr)
			| S_0288D4_STACK_SIZE(kernel->bc.nstack));
	r600_write_value(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
	r600_write_value(cs, r600_context_bo_reloc(rctx, &rctx->rings.gfx,
						kernel->code_bo, RADEON_USAGE_READ));

	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
}
static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

#ifdef HAVE_OPENCL
	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);

	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	if (!shader->kernels[pc].code_bo) {
		void *p;
		struct r600_kernel *kernel = &shader->kernels[pc];
		r600_compute_shader_create(ctx_, kernel->llvm_module, &kernel->bc);
		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							kernel->bc.ndw * 4);
		p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->ws->buffer_unmap(kernel->code_bo->cs_buf);
	}
#endif

	ctx->cs_shader_state.kernel_index = pc;
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}
static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (int i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global *)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i + 1 < 12);

				evergreen_set_rat(ctx->cs_shader_state.shader, i + 1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw * 4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}
static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (int i = 0; i < count; i++) {
		if (resource[i]) {
			assert(i < 8);
			///FETCH0 = VTX0 (param buffer),
			///FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
			evergreen_set_tex_resource(ctx->cs_shader_state.shader, resource[i], i + 2);
		}
	}
}
static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state **samplers =
		(struct compute_sampler_state **)samplers_;

	for (int i = 0; i < num_samplers; i++) {
		if (samplers[i])
			evergreen_set_sampler_resource(
				ctx->cs_shader_state.shader, samplers[i], i);
	}
}
static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	compute_memory_finalize_pending(pool, ctx_);

	for (int i = 0; i < n; i++)
	{
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
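
		/* chunk->start_in_dw is the chunk's offset into the pool in
		 * dwords; the handle handed back to the state tracker is a
		 * byte offset, hence the * 4. */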
		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
			(struct pipe_resource *)pool->bo);
}
/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs,
 * depending on the chip class.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialised in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (ctx->family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}
	/* Config Registers */
	if (ctx->chip_class < CAYMAN)
		evergreen_init_common_regs(cb, ctx->chip_class, ctx->family,
					   ctx->screen->info.drm_minor);
	else
		cayman_init_common_regs(cb, ctx->chip_class, ctx->family,
					ctx->screen->info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (ctx->chip_class < CAYMAN) {

		/* These registers control which SIMDs can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all SIMDs are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0. */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Context Registers */

	if (ctx->chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
			S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
					S_0286E8_TID_IN_GROUP_ENA
					| S_0286E8_TGID_ENA
					| S_0286E8_DISABLE_INDEX_PACK);
	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code. We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
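	/* Decoding 0x1000FFF below against that description (the bit ranges
	 * are my reading of the SQ_LOOP_CONST layout, stated as an assumption):
	 *   bits  0-11: max counter  = 0xFFF (4095)
	 *   bits 12-23: init counter = 0x000
	 *   bits 24-31: increment    = 0x01
	 */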
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}
void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->context.create_compute_state = evergreen_create_compute_state;
	ctx->context.delete_compute_state = evergreen_delete_compute_state;
	ctx->context.bind_compute_state = evergreen_bind_compute_state;
//	ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->context.set_compute_resources = evergreen_set_compute_resources;
	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
	ctx->context.set_global_binding = evergreen_set_global_binding;
	ctx->context.launch_grid = evergreen_launch_grid;

	/* We always use at least two vertex buffers for compute, one for
	 * parameters and one for global memory */
	ctx->cs_vertex_buffer_state.enabled_mask =
	ctx->cs_vertex_buffer_state.dirty_mask = 1 | 2;
}
struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	struct r600_resource_global *result = NULL;
	struct r600_screen *rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global *)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen *)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);
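
	/* The pool allocator works in dwords; round the requested size in
	 * bytes up to a whole number of dwords. */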
	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	struct r600_resource_global *buffer = NULL;
	struct r600_screen *rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global *)res;
	rscreen = (struct r600_screen *)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}
void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
	struct r600_resource_global *buffer =
		(struct r600_resource_global *)resource;
	uint32_t *map;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big

	if (!(map = r600_buffer_mmap_sync_with_rings(rctx,
			buffer->chunk->pool->bo, transfer->usage))) {
		util_slab_free(&rctx->pool_transfers, transfer);
		return NULL;
	}

	*ptransfer = transfer;
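
	/* map is a uint32_t *, so adding start_in_dw advances the pointer in
	 * dwords; the cast to char * happens before the byte-sized box.x
	 * offset is applied. */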
	COMPUTE_DBG(rctx->screen, "Buffer: %p + %u (buffer offset in global memory) "
		"+ %u (box.x)\n", map, buffer->chunk->start_in_dw, transfer->box.x);
	return ((char *)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}
void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer)
{
	struct r600_context *ctx = NULL;
	struct r600_resource_global *buffer = NULL;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	ctx = (struct r600_context *)ctx_;
	buffer = (struct r600_resource_global *)transfer->resource;

	COMPUTE_DBG(ctx->screen, "* r600_compute_global_transfer_unmap()\n");

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
	util_slab_free(&ctx->pool_transfers, transfer);
}
void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}