r600: make two compute functions static.
[mesa.git] / src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 is for binding a smaller input parameter buffer and for constant indexing;
67 it is also cached in the constant cache
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can only bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too. => 10 image bindings for writing max.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
82 writable images will consume TEX slots, and VTX slots too, because of linear indexing
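e.g. a writable image with id 0 is bound as RAT1 and read back through TEX2, id 1 as RAT2/TEX3, and so on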
83
84 */
85
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 PIPE_BIND_CUSTOM,
94 PIPE_USAGE_IMMUTABLE,
95 size);
96
97 return (struct r600_resource *)buffer;
98 }
99
100
101 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
102 unsigned id,
103 struct r600_resource *bo,
104 int start,
105 int size)
106 {
107 struct pipe_surface rat_templ;
108 struct r600_surface *surf = NULL;
109 struct r600_context *rctx = NULL;
110
111 assert(id < 12);
112 assert((size & 3) == 0);
113 assert((start & 0xFF) == 0);
114
115 rctx = pipe->ctx;
116
117 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
118
119 /* Create the RAT surface */
120 memset(&rat_templ, 0, sizeof(rat_templ));
121 rat_templ.format = PIPE_FORMAT_R32_UINT;
122 rat_templ.u.tex.level = 0;
123 rat_templ.u.tex.first_layer = 0;
124 rat_templ.u.tex.last_layer = 0;
125
126 /* Add the RAT to the list of color buffers */
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
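/* A stride of 1 lets the vertex fetch address the buffer byte by byte
 * (the offsets passed in here are byte offsets). */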
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer = buffer;
155 vb->user_buffer = NULL;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 static const struct u_resource_vtbl r600_global_buffer_vtbl =
181 {
182 u_default_resource_get_handle, /* get_handle */
183 r600_compute_global_buffer_destroy, /* resource_destroy */
184 r600_compute_global_transfer_map, /* transfer_map */
185 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
186 r600_compute_global_transfer_unmap, /* transfer_unmap */
187 r600_compute_global_transfer_inline_write /* transfer_inline_write */
188 };
189
190 /* We need to define these R600 registers here, because we can't include
191 * both evergreend.h and r600d.h.
192 */
193 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
194 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
195
196 #ifdef HAVE_OPENCL
197
198 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
199 struct r600_bytecode *bc,
200 uint64_t symbol_offset,
201 boolean *use_kill)
202 {
203 unsigned i;
204 const unsigned char *config =
205 radeon_shader_binary_config_start(binary, symbol_offset);
206
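/* The config data is a list of (register, value) dword pairs, 8 bytes per entry. */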
207 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
208 unsigned reg =
209 util_le32_to_cpu(*(uint32_t*)(config + i));
210 unsigned value =
211 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
212 switch (reg) {
213 /* R600 / R700 */
214 case R_028850_SQ_PGM_RESOURCES_PS:
215 case R_028868_SQ_PGM_RESOURCES_VS:
216 /* Evergreen / Northern Islands */
217 case R_028844_SQ_PGM_RESOURCES_PS:
218 case R_028860_SQ_PGM_RESOURCES_VS:
219 case R_0288D4_SQ_PGM_RESOURCES_LS:
220 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
221 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
222 break;
223 case R_02880C_DB_SHADER_CONTROL:
224 *use_kill = G_02880C_KILL_ENABLE(value);
225 break;
226 case R_0288E8_SQ_LDS_ALLOC:
227 bc->nlds_dw = value;
228 break;
229 }
230 }
231 }
232
233 static unsigned r600_create_shader(struct r600_bytecode *bc,
234 const struct radeon_shader_binary *binary,
235 boolean *use_kill)
236
237 {
238 assert(binary->code_size % 4 == 0);
239 bc->bytecode = CALLOC(1, binary->code_size);
240 memcpy(bc->bytecode, binary->code, binary->code_size);
241 bc->ndw = binary->code_size / 4;
242
243 r600_shader_binary_read_config(binary, bc, 0, use_kill);
244 return 0;
245 }
246
247 #endif
248
249 static void r600_destroy_shader(struct r600_bytecode *bc)
250 {
251 FREE(bc->bytecode);
252 }
253
254 static void *evergreen_create_compute_state(struct pipe_context *ctx,
255 const struct pipe_compute_state *cso)
256 {
257 struct r600_context *rctx = (struct r600_context *)ctx;
258 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
259 #ifdef HAVE_OPENCL
260 const struct pipe_llvm_program_header *header;
261 const char *code;
262 void *p;
263 boolean use_kill;
264
265 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
266 header = cso->prog;
267 code = cso->prog + sizeof(struct pipe_llvm_program_header);
268 radeon_shader_binary_init(&shader->binary);
269 radeon_elf_read(code, header->num_bytes, &shader->binary);
270 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
271
272 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
273 shader->bc.ndw * 4);
274 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
275 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
276 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
277 #endif
278
279 shader->ctx = rctx;
280 shader->local_size = cso->req_local_mem;
281 shader->private_size = cso->req_private_mem;
282 shader->input_size = cso->req_input_mem;
283
284 return shader;
285 }
286
287 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
288 {
289 struct r600_context *rctx = (struct r600_context *)ctx;
290 struct r600_pipe_compute *shader = state;
291
292 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
293
294 if (!shader)
295 return;
296
297 radeon_shader_binary_clean(&shader->binary);
298 r600_destroy_shader(&shader->bc);
299
300 /* TODO destroy shader->code_bo, shader->const_bo
301 * we'll need something like r600_buffer_free */
302 FREE(shader);
303 }
304
305 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
306 {
307 struct r600_context *rctx = (struct r600_context *)ctx;
308
309 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
310
311 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
312 }
313
314 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
315 * kernel parameters, there are implicit parameters that need to be stored
316 * in the vertex buffer as well. Here is how these parameters are organized in
317 * the buffer:
318 *
319 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
320 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
321 * DWORDS 6-8: Number of work items within each work group in each dimension
322 * (x,y,z)
323 * DWORDS 9+ : Kernel parameters
324 */
325 static void evergreen_compute_upload_input(struct pipe_context *ctx,
326 const struct pipe_grid_info *info)
327 {
328 struct r600_context *rctx = (struct r600_context *)ctx;
329 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
330 unsigned i;
331 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
332 * parameters.
333 */
334 unsigned input_size = shader->input_size + 36;
335 uint32_t *num_work_groups_start;
336 uint32_t *global_size_start;
337 uint32_t *local_size_start;
338 uint32_t *kernel_parameters_start;
339 struct pipe_box box;
340 struct pipe_transfer *transfer = NULL;
341
342 if (shader->input_size == 0) {
343 return;
344 }
345
346 if (!shader->kernel_param) {
347 /* Add space for the grid dimensions */
348 shader->kernel_param = (struct r600_resource *)
349 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
350 PIPE_USAGE_IMMUTABLE, input_size);
351 }
352
353 u_box_1d(0, input_size, &box);
354 num_work_groups_start = ctx->transfer_map(ctx,
355 (struct pipe_resource*)shader->kernel_param,
356 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
357 &box, &transfer);
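/* The mapped pointer is a uint32_t*, so adding 3 advances past one group of
 * three dwords (grid size, then global size, then local size). */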
358 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
359 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
360 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
361
362 /* Copy the grid size (the number of work groups) */
363 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
364
365 /* Copy the global size */
366 for (i = 0; i < 3; i++) {
367 global_size_start[i] = info->grid[i] * info->block[i];
368 }
369
370 /* Copy the local dimensions */
371 memcpy(local_size_start, info->block, 3 * sizeof(uint));
372
373 /* Copy the kernel inputs */
374 memcpy(kernel_parameters_start, info->input, shader->input_size);
375
376 for (i = 0; i < (input_size / 4); i++) {
377 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
378 ((unsigned*)num_work_groups_start)[i]);
379 }
380
381 ctx->transfer_unmap(ctx, transfer);
382
383 /* ID=0 is reserved for the parameters */
384 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
385 (struct pipe_resource*)shader->kernel_param);
386 }
387
388 static void evergreen_emit_dispatch(struct r600_context *rctx,
389 const struct pipe_grid_info *info)
390 {
391 int i;
392 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
393 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
394 unsigned num_waves;
395 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
396 unsigned wave_divisor = (16 * num_pipes);
397 int group_size = 1;
398 int grid_size = 1;
399 unsigned lds_size = shader->local_size / 4 +
400 shader->bc.nlds_dw;
401
402
403 /* Calculate group_size/grid_size */
404 for (i = 0; i < 3; i++) {
405 group_size *= info->block[i];
406 }
407
408 for (i = 0; i < 3; i++) {
409 grid_size *= info->grid[i];
410 }
411
412 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
413 num_waves = (info->block[0] * info->block[1] * info->block[2] +
414 wave_divisor - 1) / wave_divisor;
415
416 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
417 "%u wavefronts per thread block, "
418 "allocating %u dwords lds.\n",
419 num_pipes, num_waves, lds_size);
420
421 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
422
423 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
424 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
425 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
426 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
427
428 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
429 group_size);
430
431 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
432 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
433 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
434 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
435
436 if (rctx->b.chip_class < CAYMAN) {
437 assert(lds_size <= 8192);
438 } else {
439 /* Cayman appears to have a slightly smaller limit, see the
440 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
441 assert(lds_size <= 8160);
442 }
443
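/* SQ_LDS_ALLOC packs the LDS size in dwords together with the wave count
 * (shifted to bit 14). */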
444 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
445 lds_size | (num_waves << 14));
446
447 /* Dispatch packet */
448 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
449 radeon_emit(cs, info->grid[0]);
450 radeon_emit(cs, info->grid[1]);
451 radeon_emit(cs, info->grid[2]);
452 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
453 radeon_emit(cs, 1);
454 }
455
456 static void compute_emit_cs(struct r600_context *rctx,
457 const struct pipe_grid_info *info)
458 {
459 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
460 unsigned i;
461
462 /* make sure that the gfx ring is the only one active */
463 if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) {
464 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
465 }
466
467 /* Initialize all the compute-related registers.
468 *
469 * See evergreen_init_atom_start_compute_cs() in this file for the list
470 * of registers initialized by the start_compute_cs_cmd atom.
471 */
472 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
473
474 /* emit config state */
475 if (rctx->b.chip_class == EVERGREEN)
476 r600_emit_atom(rctx, &rctx->config_state.atom);
477
478 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
479 r600_flush_emit(rctx);
480
481 /* Emit colorbuffers. */
482 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
483 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
484 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
485 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
486 (struct r600_resource*)cb->base.texture,
487 RADEON_USAGE_READWRITE,
488 RADEON_PRIO_SHADER_RW_BUFFER);
489
490 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
491 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
492 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
493 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
494 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
495 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
496 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
497 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
498
499 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
500 radeon_emit(cs, reloc);
501
502 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
503 radeon_emit(cs, reloc);
504 }
505 for (; i < 8 ; i++)
506 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
507 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
508 for (; i < 12; i++)
509 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
510 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
511
512 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
513 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
514 rctx->compute_cb_target_mask);
515
516
517 /* Emit vertex buffer state */
518 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
519 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
520
521 /* Emit constant buffer state */
522 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
523
524 /* Emit sampler state */
525 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
526
527 /* Emit sampler view (texture resource) state */
528 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
529
530 /* Emit compute shader state */
531 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
532
533 /* Emit dispatch state and dispatch packet */
534 evergreen_emit_dispatch(rctx, info);
535
536 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
537 */
538 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
539 R600_CONTEXT_INV_VERTEX_CACHE |
540 R600_CONTEXT_INV_TEX_CACHE;
541 r600_flush_emit(rctx);
542 rctx->b.flags = 0;
543
544 if (rctx->b.chip_class >= CAYMAN) {
545 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
546 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
547 /* DEALLOC_STATE prevents the GPU from hanging when a
548 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
549 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
550 */
551 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
552 cs->buf[cs->cdw++] = 0;
553 }
554
555 #if 0
556 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
557 for (i = 0; i < cs->cdw; i++) {
558 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
559 }
560 #endif
561
562 }
563
564
565 /**
566 * Emit function for r600_cs_shader_state atom
567 */
568 void evergreen_emit_cs_shader(struct r600_context *rctx,
569 struct r600_atom *atom)
570 {
571 struct r600_cs_shader_state *state =
572 (struct r600_cs_shader_state*)atom;
573 struct r600_pipe_compute *shader = state->shader;
574 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
575 uint64_t va;
576 struct r600_resource *code_bo;
577 unsigned ngpr, nstack;
578
579 code_bo = shader->code_bo;
580 va = shader->code_bo->gpu_address + state->pc;
581 ngpr = shader->bc.ngpr;
582 nstack = shader->bc.nstack;
583
584 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
585 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
586 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
587 S_0288D4_NUM_GPRS(ngpr)
588 | S_0288D4_STACK_SIZE(nstack));
589 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
590
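/* The NOP packet carries the relocation for the shader code BO referenced
 * by SQ_PGM_START_LS above. */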
591 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
592 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
593 code_bo, RADEON_USAGE_READ,
594 RADEON_PRIO_USER_SHADER));
595 }
596
597 static void evergreen_launch_grid(struct pipe_context *ctx,
598 const struct pipe_grid_info *info)
599 {
600 struct r600_context *rctx = (struct r600_context *)ctx;
601 #ifdef HAVE_OPENCL
602 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
603 boolean use_kill;
604
605 rctx->cs_shader_state.pc = info->pc;
606 /* Get the config information for this kernel. */
607 r600_shader_binary_read_config(&shader->binary, &shader->bc,
608 info->pc, &use_kill);
609 #endif
610
611 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
612
613
614 evergreen_compute_upload_input(ctx, info);
615 compute_emit_cs(rctx, info);
616 }
617
618 static void evergreen_set_compute_resources(struct pipe_context *ctx,
619 unsigned start, unsigned count,
620 struct pipe_surface **surfaces)
621 {
622 struct r600_context *rctx = (struct r600_context *)ctx;
623 struct r600_surface **resources = (struct r600_surface **)surfaces;
624
625 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
626 start, count);
627
628 for (unsigned i = 0; i < count; i++) {
629 /* The first two vertex buffers are reserved for parameters and
630 * global buffers. */
631 unsigned vtx_id = 2 + i;
632 if (resources[i]) {
633 struct r600_resource_global *buffer =
634 (struct r600_resource_global*)
635 resources[i]->base.texture;
636 if (resources[i]->base.writable) {
637 assert(i+1 < 12);
638
639 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
640 (struct r600_resource *)resources[i]->base.texture,
641 buffer->chunk->start_in_dw*4,
642 resources[i]->base.texture->width0);
643 }
644
645 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
646 buffer->chunk->start_in_dw * 4,
647 resources[i]->base.texture);
648 }
649 }
650 }
651
652 static void evergreen_set_global_binding(struct pipe_context *ctx,
653 unsigned first, unsigned n,
654 struct pipe_resource **resources,
655 uint32_t **handles)
656 {
657 struct r600_context *rctx = (struct r600_context *)ctx;
658 struct compute_memory_pool *pool = rctx->screen->global_pool;
659 struct r600_resource_global **buffers =
660 (struct r600_resource_global **)resources;
661 unsigned i;
662
663 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
664 first, n);
665
666 if (!resources) {
667 /* XXX: Unset */
668 return;
669 }
670
671 /* We mark these items for promotion to the pool if they
672 * aren't already there */
673 for (i = first; i < first + n; i++) {
674 struct compute_memory_item *item = buffers[i]->chunk;
675
676 if (!is_item_in_pool(item))
677 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
678 }
679
680 if (compute_memory_finalize_pending(pool, ctx) == -1) {
681 /* XXX: Unset */
682 return;
683 }
684
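/* Patch each handle so that it holds the buffer's absolute byte offset
 * within the pool. */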
685 for (i = first; i < first + n; i++)
686 {
687 uint32_t buffer_offset;
688 uint32_t handle;
689 assert(resources[i]->target == PIPE_BUFFER);
690 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
691
692 buffer_offset = util_le32_to_cpu(*(handles[i]));
693 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
694
695 *(handles[i]) = util_cpu_to_le32(handle);
696 }
697
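/* RAT0 and VTX1 are reserved for the global memory pool
 * (see the comment at the top of this file). */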
698 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
699 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
700 (struct pipe_resource*)pool->bo);
701 }
702
703 /**
704 * This function initializes all the compute specific registers that need to
705 * be initialized for each compute command stream. Registers that are common
706 * to both compute and 3D will be initialized at the beginning of each compute
707 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
708 * packet requires that the shader type bit be set, we must initialize all
709 * context registers needed for compute in this function. The registers
710 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
711 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
712 * on the GPU family.
713 */
714 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
715 {
716 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
717 int num_threads;
718 int num_stack_entries;
719
720 /* since all required registers are initialized in the
721 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
722 */
723 r600_init_command_buffer(cb, 256);
724 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
725
726 /* This must be first. */
727 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
728 r600_store_value(cb, 0x80000000);
729 r600_store_value(cb, 0x80000000);
730
731 /* We're setting config registers here. */
732 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
733 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
734
735 switch (rctx->b.family) {
736 case CHIP_CEDAR:
737 default:
738 num_threads = 128;
739 num_stack_entries = 256;
740 break;
741 case CHIP_REDWOOD:
742 num_threads = 128;
743 num_stack_entries = 256;
744 break;
745 case CHIP_JUNIPER:
746 num_threads = 128;
747 num_stack_entries = 512;
748 break;
749 case CHIP_CYPRESS:
750 case CHIP_HEMLOCK:
751 num_threads = 128;
752 num_stack_entries = 512;
753 break;
754 case CHIP_PALM:
755 num_threads = 128;
756 num_stack_entries = 256;
757 break;
758 case CHIP_SUMO:
759 num_threads = 128;
760 num_stack_entries = 256;
761 break;
762 case CHIP_SUMO2:
763 num_threads = 128;
764 num_stack_entries = 512;
765 break;
766 case CHIP_BARTS:
767 num_threads = 128;
768 num_stack_entries = 512;
769 break;
770 case CHIP_TURKS:
771 num_threads = 128;
772 num_stack_entries = 256;
773 break;
774 case CHIP_CAICOS:
775 num_threads = 128;
776 num_stack_entries = 256;
777 break;
778 }
779
780 /* Config Registers */
781 if (rctx->b.chip_class < CAYMAN)
782 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
783 rctx->screen->b.info.drm_minor);
784 else
785 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
786 rctx->screen->b.info.drm_minor);
787
788 /* The primitive type always needs to be POINTLIST for compute. */
789 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
790 V_008958_DI_PT_POINTLIST);
791
792 if (rctx->b.chip_class < CAYMAN) {
793
794 /* These registers control which simds can be used by each stage.
795 * The default for these registers is 0xffffffff, which means
796 * all simds are available for each stage. It's possible we may
797 * want to play around with these in the future, but for now
798 * the default value is fine.
799 *
800 * R_008E20_SQ_STATIC_THREAD_MGMT1
801 * R_008E24_SQ_STATIC_THREAD_MGMT2
802 * R_008E28_SQ_STATIC_THREAD_MGMT3
803 */
804
805 /* XXX: We may need to adjust the thread and stack resource
806 * values for 3D/compute interop */
807
808 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
809
810 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
811 * Set the number of threads used by the PS/VS/GS/ES stage to
812 * 0.
813 */
814 r600_store_value(cb, 0);
815
816 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
817 * Set the number of threads used by the CS (aka LS) stage to
818 * the maximum number of threads and set the number of threads
819 * for the HS stage to 0. */
820 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
821
822 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
823 * Set the Control Flow stack entries to 0 for PS/VS stages */
824 r600_store_value(cb, 0);
825
826 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
827 * Set the Control Flow stack entries to 0 for GS/ES stages */
828 r600_store_value(cb, 0);
829
830 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
831 * Set the Control Flow stack entries to 0 for the HS stage, and
832 * set it to the maximum value for the CS (aka LS) stage. */
833 r600_store_value(cb,
834 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
835 }
836 /* Give the compute shader all the available LDS space.
837 * NOTE: This only sets the maximum number of dwords that a compute
838 * shader can allocate. When a shader is executed, we still need to
839 * allocate the appropriate amount of LDS dwords using the
840 * CM_R_0288E8_SQ_LDS_ALLOC register.
841 */
842 if (rctx->b.chip_class < CAYMAN) {
843 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
844 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
845 } else {
846 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
847 S_0286FC_NUM_PS_LDS(0) |
848 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
849 }
850
851 /* Context Registers */
852
853 if (rctx->b.chip_class < CAYMAN) {
854 /* workaround for hw issues with dyn gpr - must set all limits
855 * to 240 instead of 0, 0x1e == 240 / 8
856 */
857 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
858 S_028838_PS_GPRS(0x1e) |
859 S_028838_VS_GPRS(0x1e) |
860 S_028838_GS_GPRS(0x1e) |
861 S_028838_ES_GPRS(0x1e) |
862 S_028838_HS_GPRS(0x1e) |
863 S_028838_LS_GPRS(0x1e));
864 }
865
866 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
867 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
868 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
869
870 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
871
872 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
873 S_0286E8_TID_IN_GROUP_ENA
874 | S_0286E8_TGID_ENA
875 | S_0286E8_DISABLE_INDEX_PACK)
876 ;
877
878 /* The LOOP_CONST registers are an optimization for loops that allows
879 * you to store the initial counter, increment value, and maximum
880 * counter value in a register so that hardware can calculate the
881 * correct number of iterations for the loop, so that you don't need
882 * to have the loop counter in your shader code. We don't currently use
883 * this optimization, so we must keep track of the counter in the
884 * shader and use a break instruction to exit loops. However, the
885 * hardware will still use this register to determine when to exit a
886 * loop, so we need to initialize the counter to 0, set the increment
887 * value to 1 and the maximum counter value to 4095 (0xfff), which
888 * is the maximum value allowed. This gives us a maximum of 4096
889 * iterations for our loops, but hopefully our break instruction will
890 * execute some time before the 4096th iteration.
891 */
892 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
893 }
894
895 void evergreen_init_compute_state_functions(struct r600_context *rctx)
896 {
897 rctx->b.b.create_compute_state = evergreen_create_compute_state;
898 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
899 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
900 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
901 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
902 rctx->b.b.set_global_binding = evergreen_set_global_binding;
903 rctx->b.b.launch_grid = evergreen_launch_grid;
904
905 }
906
907 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
908 const struct pipe_resource *templ)
909 {
910 struct r600_resource_global* result = NULL;
911 struct r600_screen* rscreen = NULL;
912 int size_in_dw = 0;
913
914 assert(templ->target == PIPE_BUFFER);
915 assert(templ->bind & PIPE_BIND_GLOBAL);
916 assert(templ->array_size == 1 || templ->array_size == 0);
917 assert(templ->depth0 == 1 || templ->depth0 == 0);
918 assert(templ->height0 == 1 || templ->height0 == 0);
919
920 result = (struct r600_resource_global*)
921 CALLOC(sizeof(struct r600_resource_global), 1);
922 rscreen = (struct r600_screen*)screen;
923
924 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
925 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
926 templ->array_size);
927
928 result->base.b.vtbl = &r600_global_buffer_vtbl;
929 result->base.b.b = *templ;
930 result->base.b.b.screen = screen;
931 pipe_reference_init(&result->base.b.b.reference, 1);
932
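/* Round the size in bytes up to a whole number of dwords. */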
933 size_in_dw = (templ->width0+3) / 4;
934
935 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
936
937 if (result->chunk == NULL)
938 {
939 free(result);
940 return NULL;
941 }
942
943 return &result->base.b.b;
944 }
945
946 void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
947 struct pipe_resource *res)
948 {
949 struct r600_resource_global* buffer = NULL;
950 struct r600_screen* rscreen = NULL;
951
952 assert(res->target == PIPE_BUFFER);
953 assert(res->bind & PIPE_BIND_GLOBAL);
954
955 buffer = (struct r600_resource_global*)res;
956 rscreen = (struct r600_screen*)screen;
957
958 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
959
960 buffer->chunk = NULL;
961 free(res);
962 }
963
964 void *r600_compute_global_transfer_map(struct pipe_context *ctx,
965 struct pipe_resource *resource,
966 unsigned level,
967 unsigned usage,
968 const struct pipe_box *box,
969 struct pipe_transfer **ptransfer)
970 {
971 struct r600_context *rctx = (struct r600_context*)ctx;
972 struct compute_memory_pool *pool = rctx->screen->global_pool;
973 struct r600_resource_global* buffer =
974 (struct r600_resource_global*)resource;
975
976 struct compute_memory_item *item = buffer->chunk;
977 struct pipe_resource *dst = NULL;
978 unsigned offset = box->x;
979
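/* Map the item's own buffer: demote it out of the pool if it lives there,
 * or allocate a standalone buffer if it does not have one yet. */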
980 if (is_item_in_pool(item)) {
981 compute_memory_demote_item(pool, item, ctx);
982 }
983 else {
984 if (item->real_buffer == NULL) {
985 item->real_buffer =
986 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
987 }
988 }
989
990 dst = (struct pipe_resource*)item->real_buffer;
991
992 if (usage & PIPE_TRANSFER_READ)
993 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
994
995 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
996 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
997 "width = %u, height = %u, depth = %u)\n", level, usage,
998 box->x, box->y, box->z, box->width, box->height,
999 box->depth);
1000 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1001 "%u (box.x)\n", item->id, box->x);
1002
1003
1004 assert(resource->target == PIPE_BUFFER);
1005 assert(resource->bind & PIPE_BIND_GLOBAL);
1006 assert(box->x >= 0);
1007 assert(box->y == 0);
1008 assert(box->z == 0);
1009
1010 ///TODO: do it better, mapping is not possible if the pool is too big
1011 return pipe_buffer_map_range(ctx, dst,
1012 offset, box->width, usage, ptransfer);
1013 }
1014
1015 void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1016 struct pipe_transfer *transfer)
1017 {
1018 /* struct r600_resource_global are not real resources, they just map
1019 * to an offset within the compute memory pool. The function
1020 * r600_compute_global_transfer_map() maps the memory pool
1021 * resource rather than the struct r600_resource_global passed to
1022 * it as an argument and then initializes ptransfer->resource with
1023 * the memory pool resource (via pipe_buffer_map_range).
1024 * When transfer_unmap is called it uses the memory pool's
1025 * vtable which calls r600_buffer_transfer_unmap() rather than
1026 * this function.
1027 */
1028 assert (!"This function should not be called");
1029 }
1030
1031 void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1032 struct pipe_transfer *transfer,
1033 const struct pipe_box *box)
1034 {
1035 assert(0 && "TODO");
1036 }
1037
1038 void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
1039 struct pipe_resource *resource,
1040 unsigned level,
1041 unsigned usage,
1042 const struct pipe_box *box,
1043 const void *data,
1044 unsigned stride,
1045 unsigned layer_stride)
1046 {
1047 assert(0 && "TODO");
1048 }