2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
24 * Adam Rak <adam.rak@streamnovation.com>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
50 #include "radeon/radeon_llvm_util.h"
52 #include "radeon/radeon_elf_util.h"
56 RAT0 is for global binding write
57 VTX1 is for global binding read
59 for writing images RAT1...
60 for reading images TEX2...
63 TEX2... consumes the same fetch resources, that VTX2... would consume
65 CONST0 and VTX0 is for parameters
66 CONST0 is binding smaller input parameter buffer, and for constant indexing,
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
71 RAT-s are limited to 12, so we can only bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too. => at most 10 image bindings for writing.
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
79 so 10 for writing is enough. 176 is the max for reading according to the docs
81 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
82 writable images will consume TEX slots, VTX slots too because of linear indexing
86 struct r600_resource
*r600_compute_buffer_alloc_vram(struct r600_screen
*screen
,
89 struct pipe_resource
*buffer
= NULL
;
92 buffer
= pipe_buffer_create((struct pipe_screen
*) screen
,
97 return (struct r600_resource
*)buffer
;
101 static void evergreen_set_rat(struct r600_pipe_compute
*pipe
,
103 struct r600_resource
*bo
,
107 struct pipe_surface rat_templ
;
108 struct r600_surface
*surf
= NULL
;
109 struct r600_context
*rctx
= NULL
;
112 assert((size
& 3) == 0);
113 assert((start
& 0xFF) == 0);
117 COMPUTE_DBG(rctx
->screen
, "bind rat: %i \n", id
);
119 /* Create the RAT surface */
120 memset(&rat_templ
, 0, sizeof(rat_templ
));
121 rat_templ
.format
= PIPE_FORMAT_R32_UINT
;
122 rat_templ
.u
.tex
.level
= 0;
123 rat_templ
.u
.tex
.first_layer
= 0;
124 rat_templ
.u
.tex
.last_layer
= 0;
126 /* Add the RAT the list of color buffers */
127 pipe
->ctx
->framebuffer
.state
.cbufs
[id
] = pipe
->ctx
->b
.b
.create_surface(
128 (struct pipe_context
*)pipe
->ctx
,
129 (struct pipe_resource
*)bo
, &rat_templ
);
131 /* Update the number of color buffers */
132 pipe
->ctx
->framebuffer
.state
.nr_cbufs
=
133 MAX2(id
+ 1, pipe
->ctx
->framebuffer
.state
.nr_cbufs
);
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
139 pipe
->ctx
->compute_cb_target_mask
|= (0xf << (id
* 4));
141 surf
= (struct r600_surface
*)pipe
->ctx
->framebuffer
.state
.cbufs
[id
];
142 evergreen_init_color_surface_rat(rctx
, surf
);
145 static void evergreen_cs_set_vertex_buffer(struct r600_context
*rctx
,
148 struct pipe_resource
*buffer
)
150 struct r600_vertexbuf_state
*state
= &rctx
->cs_vertex_buffer_state
;
151 struct pipe_vertex_buffer
*vb
= &state
->vb
[vb_index
];
153 vb
->buffer_offset
= offset
;
155 vb
->user_buffer
= NULL
;
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx
->b
.flags
|= R600_CONTEXT_INV_VERTEX_CACHE
;
160 state
->enabled_mask
|= 1 << vb_index
;
161 state
->dirty_mask
|= 1 << vb_index
;
162 r600_mark_atom_dirty(rctx
, &state
->atom
);
165 static void evergreen_cs_set_constant_buffer(struct r600_context
*rctx
,
169 struct pipe_resource
*buffer
)
171 struct pipe_constant_buffer cb
;
172 cb
.buffer_size
= size
;
173 cb
.buffer_offset
= offset
;
175 cb
.user_buffer
= NULL
;
177 rctx
->b
.b
.set_constant_buffer(&rctx
->b
.b
, PIPE_SHADER_COMPUTE
, cb_index
, &cb
);
180 static const struct u_resource_vtbl r600_global_buffer_vtbl
=
182 u_default_resource_get_handle
, /* get_handle */
183 r600_compute_global_buffer_destroy
, /* resource_destroy */
184 r600_compute_global_transfer_map
, /* transfer_map */
185 r600_compute_global_transfer_flush_region
,/* transfer_flush_region */
186 r600_compute_global_transfer_unmap
, /* transfer_unmap */
187 r600_compute_global_transfer_inline_write
/* transfer_inline_write */
190 /* We need to define these R600 registers here, because we can't include
191 * evergreend.h and r600d.h.
193 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
194 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
198 static void r600_shader_binary_read_config(const struct radeon_shader_binary
*binary
,
199 struct r600_bytecode
*bc
,
200 uint64_t symbol_offset
,
204 const unsigned char *config
=
205 radeon_shader_binary_config_start(binary
, symbol_offset
);
207 for (i
= 0; i
< binary
->config_size_per_symbol
; i
+= 8) {
209 util_le32_to_cpu(*(uint32_t*)(config
+ i
));
211 util_le32_to_cpu(*(uint32_t*)(config
+ i
+ 4));
214 case R_028850_SQ_PGM_RESOURCES_PS
:
215 case R_028868_SQ_PGM_RESOURCES_VS
:
216 /* Evergreen / Northern Islands */
217 case R_028844_SQ_PGM_RESOURCES_PS
:
218 case R_028860_SQ_PGM_RESOURCES_VS
:
219 case R_0288D4_SQ_PGM_RESOURCES_LS
:
220 bc
->ngpr
= MAX2(bc
->ngpr
, G_028844_NUM_GPRS(value
));
221 bc
->nstack
= MAX2(bc
->nstack
, G_028844_STACK_SIZE(value
));
223 case R_02880C_DB_SHADER_CONTROL
:
224 *use_kill
= G_02880C_KILL_ENABLE(value
);
226 case R_0288E8_SQ_LDS_ALLOC
:
233 static unsigned r600_create_shader(struct r600_bytecode
*bc
,
234 const struct radeon_shader_binary
*binary
,
238 assert(binary
->code_size
% 4 == 0);
239 bc
->bytecode
= CALLOC(1, binary
->code_size
);
240 memcpy(bc
->bytecode
, binary
->code
, binary
->code_size
);
241 bc
->ndw
= binary
->code_size
/ 4;
243 r600_shader_binary_read_config(binary
, bc
, 0, use_kill
);
249 static void r600_destroy_shader(struct r600_bytecode
*bc
)
254 void *evergreen_create_compute_state(struct pipe_context
*ctx
,
255 const const struct pipe_compute_state
*cso
)
257 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
258 struct r600_pipe_compute
*shader
= CALLOC_STRUCT(r600_pipe_compute
);
260 const struct pipe_llvm_program_header
*header
;
265 COMPUTE_DBG(rctx
->screen
, "*** evergreen_create_compute_state\n");
267 code
= cso
->prog
+ sizeof(struct pipe_llvm_program_header
);
268 radeon_shader_binary_init(&shader
->binary
);
269 radeon_elf_read(code
, header
->num_bytes
, &shader
->binary
);
270 r600_create_shader(&shader
->bc
, &shader
->binary
, &use_kill
);
272 shader
->code_bo
= r600_compute_buffer_alloc_vram(rctx
->screen
,
274 p
= r600_buffer_map_sync_with_rings(&rctx
->b
, shader
->code_bo
, PIPE_TRANSFER_WRITE
);
275 memcpy(p
, shader
->bc
.bytecode
, shader
->bc
.ndw
* 4);
276 rctx
->b
.ws
->buffer_unmap(shader
->code_bo
->buf
);
280 shader
->local_size
= cso
->req_local_mem
;
281 shader
->private_size
= cso
->req_private_mem
;
282 shader
->input_size
= cso
->req_input_mem
;
287 void evergreen_delete_compute_state(struct pipe_context
*ctx
, void *state
)
289 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
290 struct r600_pipe_compute
*shader
= state
;
292 COMPUTE_DBG(rctx
->screen
, "*** evergreen_delete_compute_state\n");
297 radeon_shader_binary_clean(&shader
->binary
);
298 r600_destroy_shader(&shader
->bc
);
300 /* TODO destroy shader->code_bo, shader->const_bo
301 * we'll need something like r600_buffer_free */
305 static void evergreen_bind_compute_state(struct pipe_context
*ctx
, void *state
)
307 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
309 COMPUTE_DBG(rctx
->screen
, "*** evergreen_bind_compute_state\n");
311 rctx
->cs_shader_state
.shader
= (struct r600_pipe_compute
*)state
;
314 /* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
315 * kernel parameters there are implicit parameters that need to be stored
316 * in the vertex buffer as well. Here is how these parameters are organized in
319 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
320 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
321 * DWORDS 6-8: Number of work items within each work group in each dimension
323 * DWORDS 9+ : Kernel parameters
325 static void evergreen_compute_upload_input(struct pipe_context
*ctx
,
326 const struct pipe_grid_info
*info
)
328 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
329 struct r600_pipe_compute
*shader
= rctx
->cs_shader_state
.shader
;
331 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
334 unsigned input_size
= shader
->input_size
+ 36;
335 uint32_t *num_work_groups_start
;
336 uint32_t *global_size_start
;
337 uint32_t *local_size_start
;
338 uint32_t *kernel_parameters_start
;
340 struct pipe_transfer
*transfer
= NULL
;
342 if (shader
->input_size
== 0) {
346 if (!shader
->kernel_param
) {
347 /* Add space for the grid dimensions */
348 shader
->kernel_param
= (struct r600_resource
*)
349 pipe_buffer_create(ctx
->screen
, PIPE_BIND_CUSTOM
,
350 PIPE_USAGE_IMMUTABLE
, input_size
);
353 u_box_1d(0, input_size
, &box
);
354 num_work_groups_start
= ctx
->transfer_map(ctx
,
355 (struct pipe_resource
*)shader
->kernel_param
,
356 0, PIPE_TRANSFER_WRITE
| PIPE_TRANSFER_DISCARD_RANGE
,
358 global_size_start
= num_work_groups_start
+ (3 * (sizeof(uint
) /4));
359 local_size_start
= global_size_start
+ (3 * (sizeof(uint
)) / 4);
360 kernel_parameters_start
= local_size_start
+ (3 * (sizeof(uint
)) / 4);
362 /* Copy the work group size */
363 memcpy(num_work_groups_start
, info
->grid
, 3 * sizeof(uint
));
365 /* Copy the global size */
366 for (i
= 0; i
< 3; i
++) {
367 global_size_start
[i
] = info
->grid
[i
] * info
->block
[i
];
370 /* Copy the local dimensions */
371 memcpy(local_size_start
, info
->block
, 3 * sizeof(uint
));
373 /* Copy the kernel inputs */
374 memcpy(kernel_parameters_start
, info
->input
, shader
->input_size
);
376 for (i
= 0; i
< (input_size
/ 4); i
++) {
377 COMPUTE_DBG(rctx
->screen
, "input %i : %u\n", i
,
378 ((unsigned*)num_work_groups_start
)[i
]);
381 ctx
->transfer_unmap(ctx
, transfer
);
383 /* ID=0 is reserved for the parameters */
384 evergreen_cs_set_constant_buffer(rctx
, 0, 0, input_size
,
385 (struct pipe_resource
*)shader
->kernel_param
);
388 static void evergreen_emit_dispatch(struct r600_context
*rctx
,
389 const struct pipe_grid_info
*info
)
392 struct radeon_winsys_cs
*cs
= rctx
->b
.gfx
.cs
;
393 struct r600_pipe_compute
*shader
= rctx
->cs_shader_state
.shader
;
395 unsigned num_pipes
= rctx
->screen
->b
.info
.r600_max_quad_pipes
;
396 unsigned wave_divisor
= (16 * num_pipes
);
399 unsigned lds_size
= shader
->local_size
/ 4 +
403 /* Calculate group_size/grid_size */
404 for (i
= 0; i
< 3; i
++) {
405 group_size
*= info
->block
[i
];
408 for (i
= 0; i
< 3; i
++) {
409 grid_size
*= info
->grid
[i
];
412 /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
413 num_waves
= (info
->block
[0] * info
->block
[1] * info
->block
[2] +
414 wave_divisor
- 1) / wave_divisor
;
416 COMPUTE_DBG(rctx
->screen
, "Using %u pipes, "
417 "%u wavefronts per thread block, "
418 "allocating %u dwords lds.\n",
419 num_pipes
, num_waves
, lds_size
);
421 radeon_set_config_reg(cs
, R_008970_VGT_NUM_INDICES
, group_size
);
423 radeon_set_config_reg_seq(cs
, R_00899C_VGT_COMPUTE_START_X
, 3);
424 radeon_emit(cs
, 0); /* R_00899C_VGT_COMPUTE_START_X */
425 radeon_emit(cs
, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
426 radeon_emit(cs
, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
428 radeon_set_config_reg(cs
, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE
,
431 radeon_compute_set_context_reg_seq(cs
, R_0286EC_SPI_COMPUTE_NUM_THREAD_X
, 3);
432 radeon_emit(cs
, info
->block
[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
433 radeon_emit(cs
, info
->block
[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
434 radeon_emit(cs
, info
->block
[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
436 if (rctx
->b
.chip_class
< CAYMAN
) {
437 assert(lds_size
<= 8192);
439 /* Cayman appears to have a slightly smaller limit, see the
440 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
441 assert(lds_size
<= 8160);
444 radeon_compute_set_context_reg(cs
, R_0288E8_SQ_LDS_ALLOC
,
445 lds_size
| (num_waves
<< 14));
447 /* Dispatch packet */
448 radeon_emit(cs
, PKT3C(PKT3_DISPATCH_DIRECT
, 3, 0));
449 radeon_emit(cs
, info
->grid
[0]);
450 radeon_emit(cs
, info
->grid
[1]);
451 radeon_emit(cs
, info
->grid
[2]);
452 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
456 static void compute_emit_cs(struct r600_context
*rctx
,
457 const struct pipe_grid_info
*info
)
459 struct radeon_winsys_cs
*cs
= rctx
->b
.gfx
.cs
;
462 /* make sure that the gfx ring is only one active */
463 if (rctx
->b
.dma
.cs
&& rctx
->b
.dma
.cs
->cdw
) {
464 rctx
->b
.dma
.flush(rctx
, RADEON_FLUSH_ASYNC
, NULL
);
467 /* Initialize all the compute-related registers.
469 * See evergreen_init_atom_start_compute_cs() in this file for the list
470 * of registers initialized by the start_compute_cs_cmd atom.
472 r600_emit_command_buffer(cs
, &rctx
->start_compute_cs_cmd
);
474 /* emit config state */
475 if (rctx
->b
.chip_class
== EVERGREEN
)
476 r600_emit_atom(rctx
, &rctx
->config_state
.atom
);
478 rctx
->b
.flags
|= R600_CONTEXT_WAIT_3D_IDLE
| R600_CONTEXT_FLUSH_AND_INV
;
479 r600_flush_emit(rctx
);
481 /* Emit colorbuffers. */
482 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
483 for (i
= 0; i
< 8 && i
< rctx
->framebuffer
.state
.nr_cbufs
; i
++) {
484 struct r600_surface
*cb
= (struct r600_surface
*)rctx
->framebuffer
.state
.cbufs
[i
];
485 unsigned reloc
= radeon_add_to_buffer_list(&rctx
->b
, &rctx
->b
.gfx
,
486 (struct r600_resource
*)cb
->base
.texture
,
487 RADEON_USAGE_READWRITE
,
488 RADEON_PRIO_SHADER_RW_BUFFER
);
490 radeon_compute_set_context_reg_seq(cs
, R_028C60_CB_COLOR0_BASE
+ i
* 0x3C, 7);
491 radeon_emit(cs
, cb
->cb_color_base
); /* R_028C60_CB_COLOR0_BASE */
492 radeon_emit(cs
, cb
->cb_color_pitch
); /* R_028C64_CB_COLOR0_PITCH */
493 radeon_emit(cs
, cb
->cb_color_slice
); /* R_028C68_CB_COLOR0_SLICE */
494 radeon_emit(cs
, cb
->cb_color_view
); /* R_028C6C_CB_COLOR0_VIEW */
495 radeon_emit(cs
, cb
->cb_color_info
); /* R_028C70_CB_COLOR0_INFO */
496 radeon_emit(cs
, cb
->cb_color_attrib
); /* R_028C74_CB_COLOR0_ATTRIB */
497 radeon_emit(cs
, cb
->cb_color_dim
); /* R_028C78_CB_COLOR0_DIM */
499 radeon_emit(cs
, PKT3(PKT3_NOP
, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
500 radeon_emit(cs
, reloc
);
502 radeon_emit(cs
, PKT3(PKT3_NOP
, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
503 radeon_emit(cs
, reloc
);
506 radeon_compute_set_context_reg(cs
, R_028C70_CB_COLOR0_INFO
+ i
* 0x3C,
507 S_028C70_FORMAT(V_028C70_COLOR_INVALID
));
509 radeon_compute_set_context_reg(cs
, R_028E50_CB_COLOR8_INFO
+ (i
- 8) * 0x1C,
510 S_028C70_FORMAT(V_028C70_COLOR_INVALID
));
512 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
513 radeon_compute_set_context_reg(cs
, R_028238_CB_TARGET_MASK
,
514 rctx
->compute_cb_target_mask
);
517 /* Emit vertex buffer state */
518 rctx
->cs_vertex_buffer_state
.atom
.num_dw
= 12 * util_bitcount(rctx
->cs_vertex_buffer_state
.dirty_mask
);
519 r600_emit_atom(rctx
, &rctx
->cs_vertex_buffer_state
.atom
);
521 /* Emit constant buffer state */
522 r600_emit_atom(rctx
, &rctx
->constbuf_state
[PIPE_SHADER_COMPUTE
].atom
);
524 /* Emit sampler state */
525 r600_emit_atom(rctx
, &rctx
->samplers
[PIPE_SHADER_COMPUTE
].states
.atom
);
527 /* Emit sampler view (texture resource) state */
528 r600_emit_atom(rctx
, &rctx
->samplers
[PIPE_SHADER_COMPUTE
].views
.atom
);
530 /* Emit compute shader state */
531 r600_emit_atom(rctx
, &rctx
->cs_shader_state
.atom
);
533 /* Emit dispatch state and dispatch packet */
534 evergreen_emit_dispatch(rctx
, info
);
536 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
538 rctx
->b
.flags
|= R600_CONTEXT_INV_CONST_CACHE
|
539 R600_CONTEXT_INV_VERTEX_CACHE
|
540 R600_CONTEXT_INV_TEX_CACHE
;
541 r600_flush_emit(rctx
);
544 if (rctx
->b
.chip_class
>= CAYMAN
) {
545 cs
->buf
[cs
->cdw
++] = PKT3(PKT3_EVENT_WRITE
, 0, 0);
546 cs
->buf
[cs
->cdw
++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH
) | EVENT_INDEX(4);
547 /* DEALLOC_STATE prevents the GPU from hanging when a
548 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
549 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
551 cs
->buf
[cs
->cdw
++] = PKT3C(PKT3_DEALLOC_STATE
, 0, 0);
552 cs
->buf
[cs
->cdw
++] = 0;
556 COMPUTE_DBG(rctx
->screen
, "cdw: %i\n", cs
->cdw
);
557 for (i
= 0; i
< cs
->cdw
; i
++) {
558 COMPUTE_DBG(rctx
->screen
, "%4i : 0x%08X\n", i
, cs
->buf
[i
]);
566 * Emit function for r600_cs_shader_state atom
568 void evergreen_emit_cs_shader(struct r600_context
*rctx
,
569 struct r600_atom
*atom
)
571 struct r600_cs_shader_state
*state
=
572 (struct r600_cs_shader_state
*)atom
;
573 struct r600_pipe_compute
*shader
= state
->shader
;
574 struct radeon_winsys_cs
*cs
= rctx
->b
.gfx
.cs
;
576 struct r600_resource
*code_bo
;
577 unsigned ngpr
, nstack
;
579 code_bo
= shader
->code_bo
;
580 va
= shader
->code_bo
->gpu_address
+ state
->pc
;
581 ngpr
= shader
->bc
.ngpr
;
582 nstack
= shader
->bc
.nstack
;
584 radeon_compute_set_context_reg_seq(cs
, R_0288D0_SQ_PGM_START_LS
, 3);
585 radeon_emit(cs
, va
>> 8); /* R_0288D0_SQ_PGM_START_LS */
586 radeon_emit(cs
, /* R_0288D4_SQ_PGM_RESOURCES_LS */
587 S_0288D4_NUM_GPRS(ngpr
)
588 | S_0288D4_STACK_SIZE(nstack
));
589 radeon_emit(cs
, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
591 radeon_emit(cs
, PKT3C(PKT3_NOP
, 0, 0));
592 radeon_emit(cs
, radeon_add_to_buffer_list(&rctx
->b
, &rctx
->b
.gfx
,
593 code_bo
, RADEON_USAGE_READ
,
594 RADEON_PRIO_USER_SHADER
));
597 static void evergreen_launch_grid(struct pipe_context
*ctx
,
598 const struct pipe_grid_info
*info
)
600 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
602 struct r600_pipe_compute
*shader
= rctx
->cs_shader_state
.shader
;
605 rctx
->cs_shader_state
.pc
= info
->pc
;
606 /* Get the config information for this kernel. */
607 r600_shader_binary_read_config(&shader
->binary
, &shader
->bc
,
608 info
->pc
, &use_kill
);
611 COMPUTE_DBG(rctx
->screen
, "*** evergreen_launch_grid: pc = %u\n", info
->pc
);
614 evergreen_compute_upload_input(ctx
, info
);
615 compute_emit_cs(rctx
, info
);
618 static void evergreen_set_compute_resources(struct pipe_context
*ctx
,
619 unsigned start
, unsigned count
,
620 struct pipe_surface
**surfaces
)
622 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
623 struct r600_surface
**resources
= (struct r600_surface
**)surfaces
;
625 COMPUTE_DBG(rctx
->screen
, "*** evergreen_set_compute_resources: start = %u count = %u\n",
628 for (unsigned i
= 0; i
< count
; i
++) {
629 /* The First two vertex buffers are reserved for parameters and
631 unsigned vtx_id
= 2 + i
;
633 struct r600_resource_global
*buffer
=
634 (struct r600_resource_global
*)
635 resources
[i
]->base
.texture
;
636 if (resources
[i
]->base
.writable
) {
639 evergreen_set_rat(rctx
->cs_shader_state
.shader
, i
+1,
640 (struct r600_resource
*)resources
[i
]->base
.texture
,
641 buffer
->chunk
->start_in_dw
*4,
642 resources
[i
]->base
.texture
->width0
);
645 evergreen_cs_set_vertex_buffer(rctx
, vtx_id
,
646 buffer
->chunk
->start_in_dw
* 4,
647 resources
[i
]->base
.texture
);
652 static void evergreen_set_global_binding(struct pipe_context
*ctx
,
653 unsigned first
, unsigned n
,
654 struct pipe_resource
**resources
,
657 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
658 struct compute_memory_pool
*pool
= rctx
->screen
->global_pool
;
659 struct r600_resource_global
**buffers
=
660 (struct r600_resource_global
**)resources
;
663 COMPUTE_DBG(rctx
->screen
, "*** evergreen_set_global_binding first = %u n = %u\n",
671 /* We mark these items for promotion to the pool if they
672 * aren't already there */
673 for (i
= first
; i
< first
+ n
; i
++) {
674 struct compute_memory_item
*item
= buffers
[i
]->chunk
;
676 if (!is_item_in_pool(item
))
677 buffers
[i
]->chunk
->status
|= ITEM_FOR_PROMOTING
;
680 if (compute_memory_finalize_pending(pool
, ctx
) == -1) {
685 for (i
= first
; i
< first
+ n
; i
++)
687 uint32_t buffer_offset
;
689 assert(resources
[i
]->target
== PIPE_BUFFER
);
690 assert(resources
[i
]->bind
& PIPE_BIND_GLOBAL
);
692 buffer_offset
= util_le32_to_cpu(*(handles
[i
]));
693 handle
= buffer_offset
+ buffers
[i
]->chunk
->start_in_dw
* 4;
695 *(handles
[i
]) = util_cpu_to_le32(handle
);
698 evergreen_set_rat(rctx
->cs_shader_state
.shader
, 0, pool
->bo
, 0, pool
->size_in_dw
* 4);
699 evergreen_cs_set_vertex_buffer(rctx
, 1, 0,
700 (struct pipe_resource
*)pool
->bo
);
704 * This function initializes all the compute specific registers that need to
705 * be initialized for each compute command stream. Registers that are common
706 * to both compute and 3D will be initialized at the beginning of each compute
707 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
708 * packet requires that the shader type bit be set, we must initialize all
709 * context registers needed for compute in this function. The registers
710 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
711 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
714 void evergreen_init_atom_start_compute_cs(struct r600_context
*rctx
)
716 struct r600_command_buffer
*cb
= &rctx
->start_compute_cs_cmd
;
718 int num_stack_entries
;
720 /* since all required registers are initialized in the
721 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
723 r600_init_command_buffer(cb
, 256);
724 cb
->pkt_flags
= RADEON_CP_PACKET3_COMPUTE_MODE
;
726 /* This must be first. */
727 r600_store_value(cb
, PKT3(PKT3_CONTEXT_CONTROL
, 1, 0));
728 r600_store_value(cb
, 0x80000000);
729 r600_store_value(cb
, 0x80000000);
731 /* We're setting config registers here. */
732 r600_store_value(cb
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
733 r600_store_value(cb
, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH
) | EVENT_INDEX(4));
735 switch (rctx
->b
.family
) {
739 num_stack_entries
= 256;
743 num_stack_entries
= 256;
747 num_stack_entries
= 512;
752 num_stack_entries
= 512;
756 num_stack_entries
= 256;
760 num_stack_entries
= 256;
764 num_stack_entries
= 512;
768 num_stack_entries
= 512;
772 num_stack_entries
= 256;
776 num_stack_entries
= 256;
780 /* Config Registers */
781 if (rctx
->b
.chip_class
< CAYMAN
)
782 evergreen_init_common_regs(rctx
, cb
, rctx
->b
.chip_class
, rctx
->b
.family
,
783 rctx
->screen
->b
.info
.drm_minor
);
785 cayman_init_common_regs(cb
, rctx
->b
.chip_class
, rctx
->b
.family
,
786 rctx
->screen
->b
.info
.drm_minor
);
788 /* The primitive type always needs to be POINTLIST for compute. */
789 r600_store_config_reg(cb
, R_008958_VGT_PRIMITIVE_TYPE
,
790 V_008958_DI_PT_POINTLIST
);
792 if (rctx
->b
.chip_class
< CAYMAN
) {
794 /* These registers control which simds can be used by each stage.
795 * The default for these registers is 0xffffffff, which means
796 * all simds are available for each stage. It's possible we may
797 * want to play around with these in the future, but for now
798 * the default value is fine.
800 * R_008E20_SQ_STATIC_THREAD_MGMT1
801 * R_008E24_SQ_STATIC_THREAD_MGMT2
802 * R_008E28_SQ_STATIC_THREAD_MGMT3
805 /* XXX: We may need to adjust the thread and stack resource
806 * values for 3D/compute interop */
808 r600_store_config_reg_seq(cb
, R_008C18_SQ_THREAD_RESOURCE_MGMT_1
, 5);
810 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
811 * Set the number of threads used by the PS/VS/GS/ES stage to
814 r600_store_value(cb
, 0);
816 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
817 * Set the number of threads used by the CS (aka LS) stage to
818 * the maximum number of threads and set the number of threads
819 * for the HS stage to 0. */
820 r600_store_value(cb
, S_008C1C_NUM_LS_THREADS(num_threads
));
822 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
823 * Set the Control Flow stack entries to 0 for PS/VS stages */
824 r600_store_value(cb
, 0);
826 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
827 * Set the Control Flow stack entries to 0 for GS/ES stages */
828 r600_store_value(cb
, 0);
830 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
831 * Set the Contol Flow stack entries to 0 for the HS stage, and
832 * set it to the maximum value for the CS (aka LS) stage. */
834 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries
));
836 /* Give the compute shader all the available LDS space.
837 * NOTE: This only sets the maximum number of dwords that a compute
838 * shader can allocate. When a shader is executed, we still need to
839 * allocate the appropriate amount of LDS dwords using the
840 * CM_R_0288E8_SQ_LDS_ALLOC register.
842 if (rctx
->b
.chip_class
< CAYMAN
) {
843 r600_store_config_reg(cb
, R_008E2C_SQ_LDS_RESOURCE_MGMT
,
844 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
846 r600_store_context_reg(cb
, CM_R_0286FC_SPI_LDS_MGMT
,
847 S_0286FC_NUM_PS_LDS(0) |
848 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
851 /* Context Registers */
853 if (rctx
->b
.chip_class
< CAYMAN
) {
854 /* workaround for hw issues with dyn gpr - must set all limits
855 * to 240 instead of 0, 0x1e == 240 / 8
857 r600_store_context_reg(cb
, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1
,
858 S_028838_PS_GPRS(0x1e) |
859 S_028838_VS_GPRS(0x1e) |
860 S_028838_GS_GPRS(0x1e) |
861 S_028838_ES_GPRS(0x1e) |
862 S_028838_HS_GPRS(0x1e) |
863 S_028838_LS_GPRS(0x1e));
866 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
867 r600_store_context_reg(cb
, R_028A40_VGT_GS_MODE
,
868 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
870 r600_store_context_reg(cb
, R_028B54_VGT_SHADER_STAGES_EN
, 2/*CS_ON*/);
872 r600_store_context_reg(cb
, R_0286E8_SPI_COMPUTE_INPUT_CNTL
,
873 S_0286E8_TID_IN_GROUP_ENA
875 | S_0286E8_DISABLE_INDEX_PACK
)
878 /* The LOOP_CONST registers are an optimizations for loops that allows
879 * you to store the initial counter, increment value, and maximum
880 * counter value in a register so that hardware can calculate the
881 * correct number of iterations for the loop, so that you don't need
882 * to have the loop counter in your shader code. We don't currently use
883 * this optimization, so we must keep track of the counter in the
884 * shader and use a break instruction to exit loops. However, the
885 * hardware will still uses this register to determine when to exit a
886 * loop, so we need to initialize the counter to 0, set the increment
887 * value to 1 and the maximum counter value to the 4095 (0xfff) which
888 * is the maximum value allowed. This gives us a maximum of 4096
889 * iterations for our loops, but hopefully our break instruction will
890 * execute before some time before the 4096th iteration.
892 eg_store_loop_const(cb
, R_03A200_SQ_LOOP_CONST_0
+ (160 * 4), 0x1000FFF);
895 void evergreen_init_compute_state_functions(struct r600_context
*rctx
)
897 rctx
->b
.b
.create_compute_state
= evergreen_create_compute_state
;
898 rctx
->b
.b
.delete_compute_state
= evergreen_delete_compute_state
;
899 rctx
->b
.b
.bind_compute_state
= evergreen_bind_compute_state
;
900 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
901 rctx
->b
.b
.set_compute_resources
= evergreen_set_compute_resources
;
902 rctx
->b
.b
.set_global_binding
= evergreen_set_global_binding
;
903 rctx
->b
.b
.launch_grid
= evergreen_launch_grid
;
907 struct pipe_resource
*r600_compute_global_buffer_create(struct pipe_screen
*screen
,
908 const struct pipe_resource
*templ
)
910 struct r600_resource_global
* result
= NULL
;
911 struct r600_screen
* rscreen
= NULL
;
914 assert(templ
->target
== PIPE_BUFFER
);
915 assert(templ
->bind
& PIPE_BIND_GLOBAL
);
916 assert(templ
->array_size
== 1 || templ
->array_size
== 0);
917 assert(templ
->depth0
== 1 || templ
->depth0
== 0);
918 assert(templ
->height0
== 1 || templ
->height0
== 0);
920 result
= (struct r600_resource_global
*)
921 CALLOC(sizeof(struct r600_resource_global
), 1);
922 rscreen
= (struct r600_screen
*)screen
;
924 COMPUTE_DBG(rscreen
, "*** r600_compute_global_buffer_create\n");
925 COMPUTE_DBG(rscreen
, "width = %u array_size = %u\n", templ
->width0
,
928 result
->base
.b
.vtbl
= &r600_global_buffer_vtbl
;
929 result
->base
.b
.b
= *templ
;
930 result
->base
.b
.b
.screen
= screen
;
931 pipe_reference_init(&result
->base
.b
.b
.reference
, 1);
933 size_in_dw
= (templ
->width0
+3) / 4;
935 result
->chunk
= compute_memory_alloc(rscreen
->global_pool
, size_in_dw
);
937 if (result
->chunk
== NULL
)
943 return &result
->base
.b
.b
;
946 void r600_compute_global_buffer_destroy(struct pipe_screen
*screen
,
947 struct pipe_resource
*res
)
949 struct r600_resource_global
* buffer
= NULL
;
950 struct r600_screen
* rscreen
= NULL
;
952 assert(res
->target
== PIPE_BUFFER
);
953 assert(res
->bind
& PIPE_BIND_GLOBAL
);
955 buffer
= (struct r600_resource_global
*)res
;
956 rscreen
= (struct r600_screen
*)screen
;
958 compute_memory_free(rscreen
->global_pool
, buffer
->chunk
->id
);
960 buffer
->chunk
= NULL
;
964 void *r600_compute_global_transfer_map(struct pipe_context
*ctx
,
965 struct pipe_resource
*resource
,
968 const struct pipe_box
*box
,
969 struct pipe_transfer
**ptransfer
)
971 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
972 struct compute_memory_pool
*pool
= rctx
->screen
->global_pool
;
973 struct r600_resource_global
* buffer
=
974 (struct r600_resource_global
*)resource
;
976 struct compute_memory_item
*item
= buffer
->chunk
;
977 struct pipe_resource
*dst
= NULL
;
978 unsigned offset
= box
->x
;
980 if (is_item_in_pool(item
)) {
981 compute_memory_demote_item(pool
, item
, ctx
);
984 if (item
->real_buffer
== NULL
) {
986 r600_compute_buffer_alloc_vram(pool
->screen
, item
->size_in_dw
* 4);
990 dst
= (struct pipe_resource
*)item
->real_buffer
;
992 if (usage
& PIPE_TRANSFER_READ
)
993 buffer
->chunk
->status
|= ITEM_MAPPED_FOR_READING
;
995 COMPUTE_DBG(rctx
->screen
, "* r600_compute_global_transfer_map()\n"
996 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
997 "width = %u, height = %u, depth = %u)\n", level
, usage
,
998 box
->x
, box
->y
, box
->z
, box
->width
, box
->height
,
1000 COMPUTE_DBG(rctx
->screen
, "Buffer id = %"PRIi64
" offset = "
1001 "%u (box.x)\n", item
->id
, box
->x
);
1004 assert(resource
->target
== PIPE_BUFFER
);
1005 assert(resource
->bind
& PIPE_BIND_GLOBAL
);
1006 assert(box
->x
>= 0);
1007 assert(box
->y
== 0);
1008 assert(box
->z
== 0);
1010 ///TODO: do it better, mapping is not possible if the pool is too big
1011 return pipe_buffer_map_range(ctx
, dst
,
1012 offset
, box
->width
, usage
, ptransfer
);
void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
                                        struct pipe_transfer *transfer)
{
        /* struct r600_resource_global are not real resources, they just map
         * to an offset within the compute memory pool.  The function
         * r600_compute_global_transfer_map() maps the memory pool
         * resource rather than the struct r600_resource_global passed to
         * it as an argument and then initializes ptransfer->resource with
         * the memory pool resource (via pipe_buffer_map_range).
         * When transfer_unmap is called it uses the memory pool's
         * vtable which calls r600_buffer_transfer_unmap() rather than
         * this function.  So this should never be reached. */
        assert (!"This function should not be called");
}
/* vtbl transfer_flush_region hook — not implemented for global buffers. */
void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
                                               struct pipe_transfer *transfer,
                                               const struct pipe_box *box)
{
        assert(0 && "TODO");
}
1035 assert(0 && "TODO");
1038 void r600_compute_global_transfer_inline_write(struct pipe_context
*pipe
,
1039 struct pipe_resource
*resource
,
1042 const struct pipe_box
*box
,
1045 unsigned layer_stride
)
1047 assert(0 && "TODO");