r600: cleanup whitespace in evergreen_compute.c
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can only bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => at most 10 image bindings for writing.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
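/* A minimal sketch of the index math implied by the scheme above (illustrative
 * only; the actual binding is done in evergreen_set_compute_resources() and
 * evergreen_set_global_binding() below):
 *
 *   RAT id of writable image i    = i + 1   (RAT0 is the global buffer)
 *   VTX id of compute resource i  = i + 2   (VTX0 = parameters, VTX1 = global reads)
 */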
85
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 PIPE_BIND_CUSTOM,
94 PIPE_USAGE_IMMUTABLE,
95 size);
96
97 return (struct r600_resource *)buffer;
98 }
99
100
101 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
102 unsigned id,
103 struct r600_resource *bo,
104 int start,
105 int size)
106 {
107 struct pipe_surface rat_templ;
108 struct r600_surface *surf = NULL;
109 struct r600_context *rctx = NULL;
110
111 assert(id < 12);
112 assert((size & 3) == 0);
113 assert((start & 0xFF) == 0);
114
115 rctx = pipe->ctx;
116
117 COMPUTE_DBG(rctx->screen, "bind rat: %i\n", id);
118
119 /* Create the RAT surface */
120 memset(&rat_templ, 0, sizeof(rat_templ));
121 rat_templ.format = PIPE_FORMAT_R32_UINT;
122 rat_templ.u.tex.level = 0;
123 rat_templ.u.tex.first_layer = 0;
124 rat_templ.u.tex.last_layer = 0;
125
126 /* Add the RAT to the list of color buffers */
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
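/* For example, for RAT id 1 the statement above ORs in 0x000000f0, i.e. it
 * enables all four channels of color buffer 1. */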
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer = buffer;
155 vb->user_buffer = NULL;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 static const struct u_resource_vtbl r600_global_buffer_vtbl =
181 {
182 u_default_resource_get_handle, /* get_handle */
183 r600_compute_global_buffer_destroy, /* resource_destroy */
184 r600_compute_global_transfer_map, /* transfer_map */
185 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
186 r600_compute_global_transfer_unmap, /* transfer_unmap */
187 r600_compute_global_transfer_inline_write /* transfer_inline_write */
188 };
189
190 /* We need to define these R600 registers here, because we can't include
191 * r600d.h together with evergreend.h (their register definitions conflict).
192 */
193 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
194 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
195
196 #ifdef HAVE_OPENCL
197
198 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
199 struct r600_bytecode *bc,
200 uint64_t symbol_offset,
201 boolean *use_kill)
202 {
203 unsigned i;
204 const unsigned char *config =
205 radeon_shader_binary_config_start(binary, symbol_offset);
206
207 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
208 unsigned reg =
209 util_le32_to_cpu(*(uint32_t*)(config + i));
210 unsigned value =
211 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
212 switch (reg) {
213 /* R600 / R700 */
214 case R_028850_SQ_PGM_RESOURCES_PS:
215 case R_028868_SQ_PGM_RESOURCES_VS:
216 /* Evergreen / Northern Islands */
217 case R_028844_SQ_PGM_RESOURCES_PS:
218 case R_028860_SQ_PGM_RESOURCES_VS:
219 case R_0288D4_SQ_PGM_RESOURCES_LS:
220 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
221 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
222 break;
223 case R_02880C_DB_SHADER_CONTROL:
224 *use_kill = G_02880C_KILL_ENABLE(value);
225 break;
226 case R_0288E8_SQ_LDS_ALLOC:
227 bc->nlds_dw = value;
228 break;
229 }
230 }
231 }
232
233 static unsigned r600_create_shader(struct r600_bytecode *bc,
234 const struct radeon_shader_binary *binary,
235 boolean *use_kill)
236
237 {
238 assert(binary->code_size % 4 == 0);
239 bc->bytecode = CALLOC(1, binary->code_size);
240 memcpy(bc->bytecode, binary->code, binary->code_size);
241 bc->ndw = binary->code_size / 4;
242
243 r600_shader_binary_read_config(binary, bc, 0, use_kill);
244 return 0;
245 }
246
247 #endif
248
249 static void r600_destroy_shader(struct r600_bytecode *bc)
250 {
251 FREE(bc->bytecode);
252 }
253
254 void *evergreen_create_compute_state(struct pipe_context *ctx_,
255 const struct pipe_compute_state *cso)
256 {
257 struct r600_context *ctx = (struct r600_context *)ctx_;
258 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
259 #ifdef HAVE_OPENCL
260 const struct pipe_llvm_program_header *header;
261 const char *code;
262 void *p;
263 boolean use_kill;
264
265 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
266 header = cso->prog;
267 code = cso->prog + sizeof(struct pipe_llvm_program_header);
268 radeon_shader_binary_init(&shader->binary);
269 radeon_elf_read(code, header->num_bytes, &shader->binary);
270 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
271
272 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
273 shader->bc.ndw * 4);
274 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
275 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
276 ctx->b.ws->buffer_unmap(shader->code_bo->buf);
277 #endif
278
279 shader->ctx = ctx;
280 shader->local_size = cso->req_local_mem;
281 shader->private_size = cso->req_private_mem;
282 shader->input_size = cso->req_input_mem;
283
284 return shader;
285 }
286
287 void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state)
288 {
289 struct r600_context *ctx = (struct r600_context *)ctx_;
290 struct r600_pipe_compute *shader = state;
291
292 COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
293
294 if (!shader)
295 return;
296
297 radeon_shader_binary_clean(&shader->binary);
298 r600_destroy_shader(&shader->bc);
299
300 /* TODO destroy shader->code_bo, shader->const_bo
301 * we'll need something like r600_buffer_free */
302 FREE(shader);
303 }
304
305 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
306 {
307 struct r600_context *ctx = (struct r600_context *)ctx_;
308
309 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
310
311 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
312 }
313
314 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
315 * kernel parameters, there are implicit parameters that need to be stored
316 * in the vertex buffer as well. Here is how these parameters are organized in
317 * the buffer:
318 *
319 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
320 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
321 * DWORDS 6-8: Number of work items within each work group in each dimension
322 * (x,y,z)
323 * DWORDS 9+ : Kernel parameters
324 */
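/* A worked example with illustrative values: launching a kernel with
 * grid_layout = {4, 2, 1} and block_layout = {64, 1, 1} yields
 *
 *   DWORDS 0-2: 4, 2, 1     (number of work groups)
 *   DWORDS 3-5: 256, 2, 1   (global work items = grid * block per dimension)
 *   DWORDS 6-8: 64, 1, 1    (work items per work group)
 *   DWORDS 9+ : the kernel arguments, copied verbatim from 'input'
 */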
325 void evergreen_compute_upload_input(struct pipe_context *ctx_,
326 const uint *block_layout,
327 const uint *grid_layout,
328 const void *input)
329 {
330 struct r600_context *ctx = (struct r600_context *)ctx_;
331 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
332 unsigned i;
333 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
334 * parameters.
335 */
336 unsigned input_size = shader->input_size + 36;
337 uint32_t *num_work_groups_start;
338 uint32_t *global_size_start;
339 uint32_t *local_size_start;
340 uint32_t *kernel_parameters_start;
341 struct pipe_box box;
342 struct pipe_transfer *transfer = NULL;
343
344 if (shader->input_size == 0) {
345 return;
346 }
347
348 if (!shader->kernel_param) {
349 /* Add space for the grid dimensions */
350 shader->kernel_param = (struct r600_resource *)
351 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
352 PIPE_USAGE_IMMUTABLE, input_size);
353 }
354
355 u_box_1d(0, input_size, &box);
356 num_work_groups_start = ctx_->transfer_map(ctx_,
357 (struct pipe_resource*)shader->kernel_param,
358 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
359 &box, &transfer);
360 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
361 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
362 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
363
364 /* Copy the grid layout (number of work groups per dimension) */
365 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
366
367 /* Copy the global size */
368 for (i = 0; i < 3; i++) {
369 global_size_start[i] = grid_layout[i] * block_layout[i];
370 }
371
372 /* Copy the local dimensions */
373 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
374
375 /* Copy the kernel inputs */
376 memcpy(kernel_parameters_start, input, shader->input_size);
377
378 for (i = 0; i < (input_size / 4); i++) {
379 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
380 ((unsigned*)num_work_groups_start)[i]);
381 }
382
383 ctx_->transfer_unmap(ctx_, transfer);
384
385 /* ID=0 is reserved for the parameters */
386 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
387 (struct pipe_resource*)shader->kernel_param);
388 }
389
390 static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
391 const uint *block_layout,
392 const uint *grid_layout)
393 {
394 int i;
395 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
396 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
397 unsigned num_waves;
398 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
399 unsigned wave_divisor = (16 * num_pipes);
400 int group_size = 1;
401 int grid_size = 1;
402 unsigned lds_size = shader->local_size / 4 +
403 shader->bc.nlds_dw;
404
405
406 /* Calculate group_size/grid_size */
407 for (i = 0; i < 3; i++) {
408 group_size *= block_layout[i];
409 }
410
411 for (i = 0; i < 3; i++) {
412 grid_size *= grid_layout[i];
413 }
414
415 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
416 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
417 wave_divisor - 1) / wave_divisor;
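/* Worked example with illustrative numbers: block_layout = {16, 16, 1}
 * gives 256 threads; with num_pipes = 8, wave_divisor = 128, so
 * num_waves = (256 + 127) / 128 = 2 wavefronts per thread block. */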
418
419 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
420 "%u wavefronts per thread block, "
421 "allocating %u dwords lds.\n",
422 num_pipes, num_waves, lds_size);
423
424 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
425
426 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
427 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
428 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
429 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
430
431 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
432 group_size);
433
434 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
435 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
436 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
437 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
438
439 if (rctx->b.chip_class < CAYMAN) {
440 assert(lds_size <= 8192);
441 } else {
442 /* Cayman appears to have a slightly smaller limit, see the
443 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
444 assert(lds_size <= 8160);
445 }
446
447 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
448 lds_size | (num_waves << 14));
449
450 /* Dispatch packet */
451 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
452 radeon_emit(cs, grid_layout[0]);
453 radeon_emit(cs, grid_layout[1]);
454 radeon_emit(cs, grid_layout[2]);
455 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
456 radeon_emit(cs, 1);
457 }
458
459 static void compute_emit_cs(struct r600_context *ctx,
460 const uint *block_layout,
461 const uint *grid_layout)
462 {
463 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
464 unsigned i;
465
466 /* Make sure only the gfx ring is active, i.e. flush any pending DMA work. */
467 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
468 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
469 }
470
471 /* Initialize all the compute-related registers.
472 *
473 * See evergreen_init_atom_start_compute_cs() in this file for the list
474 * of registers initialized by the start_compute_cs_cmd atom.
475 */
476 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
477
478 /* emit config state */
479 if (ctx->b.chip_class == EVERGREEN)
480 r600_emit_atom(ctx, &ctx->config_state.atom);
481
482 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
483 r600_flush_emit(ctx);
484
485 /* Emit colorbuffers. */
486 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
487 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
488 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
489 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
490 (struct r600_resource*)cb->base.texture,
491 RADEON_USAGE_READWRITE,
492 RADEON_PRIO_SHADER_RW_BUFFER);
493
494 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
495 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
496 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
497 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
498 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
499 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
500 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
501 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
502
503 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
504 radeon_emit(cs, reloc);
505
506 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
507 radeon_emit(cs, reloc);
508 }
509 for (; i < 8 ; i++)
510 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
511 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
512 for (; i < 12; i++)
513 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
514 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
515
516 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
517 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
518 ctx->compute_cb_target_mask);
519
520
521 /* Emit vertex buffer state */
522 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
523 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
524
525 /* Emit constant buffer state */
526 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
527
528 /* Emit sampler state */
529 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
530
531 /* Emit sampler view (texture resource) state */
532 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
533
534 /* Emit compute shader state */
535 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
536
537 /* Emit dispatch state and dispatch packet */
538 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
539
540 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
541 */
542 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
543 R600_CONTEXT_INV_VERTEX_CACHE |
544 R600_CONTEXT_INV_TEX_CACHE;
545 r600_flush_emit(ctx);
546 ctx->b.flags = 0;
547
548 if (ctx->b.chip_class >= CAYMAN) {
549 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
550 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
551 /* DEALLOC_STATE prevents the GPU from hanging when a
552 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
553 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
554 */
555 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
556 cs->buf[cs->cdw++] = 0;
557 }
558
559 #if 0
560 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
561 for (i = 0; i < cs->cdw; i++) {
562 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
563 }
564 #endif
565
566 }
567
568
569 /**
570 * Emit function for r600_cs_shader_state atom
571 */
572 void evergreen_emit_cs_shader(struct r600_context *rctx,
573 struct r600_atom *atom)
574 {
575 struct r600_cs_shader_state *state =
576 (struct r600_cs_shader_state*)atom;
577 struct r600_pipe_compute *shader = state->shader;
578 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
579 uint64_t va;
580 struct r600_resource *code_bo;
581 unsigned ngpr, nstack;
582
583 code_bo = shader->code_bo;
584 va = shader->code_bo->gpu_address + state->pc;
585 ngpr = shader->bc.ngpr;
586 nstack = shader->bc.nstack;
587
588 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
589 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
590 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
591 S_0288D4_NUM_GPRS(ngpr)
592 | S_0288D4_STACK_SIZE(nstack));
593 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
594
595 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
596 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
597 code_bo, RADEON_USAGE_READ,
598 RADEON_PRIO_USER_SHADER));
599 }
600
601 static void evergreen_launch_grid(struct pipe_context *ctx_,
602 const struct pipe_grid_info *info)
603 {
604 struct r600_context *ctx = (struct r600_context *)ctx_;
605 #ifdef HAVE_OPENCL
606 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
607 boolean use_kill;
608
609 ctx->cs_shader_state.pc = info->pc;
610 /* Get the config information for this kernel. */
611 r600_shader_binary_read_config(&shader->binary, &shader->bc,
612 info->pc, &use_kill);
613 #endif
614
615 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
616
617
618 evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input);
619 compute_emit_cs(ctx, info->block, info->grid);
620 }
621
622 static void evergreen_set_compute_resources(struct pipe_context *ctx_,
623 unsigned start, unsigned count,
624 struct pipe_surface **surfaces)
625 {
626 struct r600_context *ctx = (struct r600_context *)ctx_;
627 struct r600_surface **resources = (struct r600_surface **)surfaces;
628
629 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
630 start, count);
631
632 for (unsigned i = 0; i < count; i++) {
633 /* The first two vertex buffers are reserved for parameters and
634 * global buffers. */
635 unsigned vtx_id = 2 + i;
636 if (resources[i]) {
637 struct r600_resource_global *buffer =
638 (struct r600_resource_global*)
639 resources[i]->base.texture;
640 if (resources[i]->base.writable) {
641 assert(i+1 < 12);
642
643 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
644 (struct r600_resource *)resources[i]->base.texture,
645 buffer->chunk->start_in_dw*4,
646 resources[i]->base.texture->width0);
647 }
648
649 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
650 buffer->chunk->start_in_dw * 4,
651 resources[i]->base.texture);
652 }
653 }
654 }
655
656 static void evergreen_set_global_binding(struct pipe_context *ctx_,
657 unsigned first, unsigned n,
658 struct pipe_resource **resources,
659 uint32_t **handles)
660 {
661 struct r600_context *ctx = (struct r600_context *)ctx_;
662 struct compute_memory_pool *pool = ctx->screen->global_pool;
663 struct r600_resource_global **buffers =
664 (struct r600_resource_global **)resources;
665 unsigned i;
666
667 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
668 first, n);
669
670 if (!resources) {
671 /* XXX: Unset */
672 return;
673 }
674
675 /* We mark these items for promotion to the pool if they
676 * aren't already there */
677 for (i = first; i < first + n; i++) {
678 struct compute_memory_item *item = buffers[i]->chunk;
679
680 if (!is_item_in_pool(item))
681 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
682 }
683
684 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
685 /* XXX: Unset */
686 return;
687 }
688
689 for (i = first; i < first + n; i++)
690 {
691 uint32_t buffer_offset;
692 uint32_t handle;
693 assert(resources[i]->target == PIPE_BUFFER);
694 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
695
696 buffer_offset = util_le32_to_cpu(*(handles[i]));
697 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
698
699 *(handles[i]) = util_cpu_to_le32(handle);
700 }
701
702 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
703 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
704 (struct pipe_resource*)pool->bo);
705 }
706
707 /**
708 * This function initializes all the compute specific registers that need to
709 * be initialized for each compute command stream. Registers that are common
710 * to both compute and 3D will be initialized at the beginning of each compute
711 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
712 * packet requires that the shader type bit be set, we must initialize all
713 * context registers needed for compute in this function. The registers
714 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
715 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
716 * on the GPU family.
717 */
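/* In practice the compute variant of these packets is selected through
 * cb->pkt_flags, which is set to RADEON_CP_PACKET3_COMPUTE_MODE just below;
 * the r600_store_*_reg() helpers OR that flag into the PKT3 headers they
 * build, marking the register writes as compute rather than 3D. */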
718 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
719 {
720 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
721 int num_threads;
722 int num_stack_entries;
723
724 /* since all required registers are initialized in the
725 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
726 */
727 r600_init_command_buffer(cb, 256);
728 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
729
730 /* This must be first. */
731 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
732 r600_store_value(cb, 0x80000000);
733 r600_store_value(cb, 0x80000000);
734
735 /* We're setting config registers here. */
736 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
737 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
738
739 switch (ctx->b.family) {
740 case CHIP_CEDAR:
741 default:
742 num_threads = 128;
743 num_stack_entries = 256;
744 break;
745 case CHIP_REDWOOD:
746 num_threads = 128;
747 num_stack_entries = 256;
748 break;
749 case CHIP_JUNIPER:
750 num_threads = 128;
751 num_stack_entries = 512;
752 break;
753 case CHIP_CYPRESS:
754 case CHIP_HEMLOCK:
755 num_threads = 128;
756 num_stack_entries = 512;
757 break;
758 case CHIP_PALM:
759 num_threads = 128;
760 num_stack_entries = 256;
761 break;
762 case CHIP_SUMO:
763 num_threads = 128;
764 num_stack_entries = 256;
765 break;
766 case CHIP_SUMO2:
767 num_threads = 128;
768 num_stack_entries = 512;
769 break;
770 case CHIP_BARTS:
771 num_threads = 128;
772 num_stack_entries = 512;
773 break;
774 case CHIP_TURKS:
775 num_threads = 128;
776 num_stack_entries = 256;
777 break;
778 case CHIP_CAICOS:
779 num_threads = 128;
780 num_stack_entries = 256;
781 break;
782 }
783
784 /* Config Registers */
785 if (ctx->b.chip_class < CAYMAN)
786 evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
787 ctx->screen->b.info.drm_minor);
788 else
789 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
790 ctx->screen->b.info.drm_minor);
791
792 /* The primitive type always needs to be POINTLIST for compute. */
793 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
794 V_008958_DI_PT_POINTLIST);
795
796 if (ctx->b.chip_class < CAYMAN) {
797
798 /* These registers control which simds can be used by each stage.
799 * The default for these registers is 0xffffffff, which means
800 * all simds are available for each stage. It's possible we may
801 * want to play around with these in the future, but for now
802 * the default value is fine.
803 *
804 * R_008E20_SQ_STATIC_THREAD_MGMT1
805 * R_008E24_SQ_STATIC_THREAD_MGMT2
806 * R_008E28_SQ_STATIC_THREAD_MGMT3
807 */
808
809 /* XXX: We may need to adjust the thread and stack resource
810 * values for 3D/compute interop */
811
812 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
813
814 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
815 * Set the number of threads used by the PS/VS/GS/ES stage to
816 * 0.
817 */
818 r600_store_value(cb, 0);
819
820 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
821 * Set the number of threads used by the CS (aka LS) stage to
822 * the maximum number of threads and set the number of threads
823 * for the HS stage to 0. */
824 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
825
826 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
827 * Set the Control Flow stack entries to 0 for PS/VS stages */
828 r600_store_value(cb, 0);
829
830 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
831 * Set the Control Flow stack entries to 0 for GS/ES stages */
832 r600_store_value(cb, 0);
833
834 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
835 * Set the Control Flow stack entries to 0 for the HS stage, and
836 * set it to the maximum value for the CS (aka LS) stage. */
837 r600_store_value(cb,
838 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
839 }
840 /* Give the compute shader all the available LDS space.
841 * NOTE: This only sets the maximum number of dwords that a compute
842 * shader can allocate. When a shader is executed, we still need to
843 * allocate the appropriate amount of LDS dwords using the
844 * CM_R_0288E8_SQ_LDS_ALLOC register.
845 */
846 if (ctx->b.chip_class < CAYMAN) {
847 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
848 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
849 } else {
850 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
851 S_0286FC_NUM_PS_LDS(0) |
852 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
853 }
854
855 /* Context Registers */
856
857 if (ctx->b.chip_class < CAYMAN) {
858 /* workaround for hw issues with dyn gpr - must set all limits
859 * to 240 instead of 0, 0x1e == 240 / 8
860 */
861 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
862 S_028838_PS_GPRS(0x1e) |
863 S_028838_VS_GPRS(0x1e) |
864 S_028838_GS_GPRS(0x1e) |
865 S_028838_ES_GPRS(0x1e) |
866 S_028838_HS_GPRS(0x1e) |
867 S_028838_LS_GPRS(0x1e));
868 }
869
870 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
871 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
872 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
873
874 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
875
876 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
877 S_0286E8_TID_IN_GROUP_ENA |
878 S_0286E8_TGID_ENA |
879 S_0286E8_DISABLE_INDEX_PACK);
881
882 /* The LOOP_CONST registers are an optimization for loops that allows
883 * you to store the initial counter, increment value, and maximum
884 * counter value in a register so that the hardware can calculate the
885 * correct number of iterations for the loop, and you don't need
886 * to have the loop counter in your shader code. We don't currently use
887 * this optimization, so we must keep track of the counter in the
888 * shader and use a break instruction to exit loops. However, the
889 * hardware still uses this register to determine when to exit a
890 * loop, so we need to initialize the counter to 0, set the increment
891 * value to 1 and the maximum counter value to 4095 (0xfff), which
892 * is the maximum value allowed. This gives us a maximum of 4096
893 * iterations for our loops, but hopefully our break instruction will
894 * execute some time before the 4096th iteration.
895 */
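/* Rough decode of the 0x1000FFF value stored below, assuming the usual
 * SQ_LOOP_CONST field layout (increment in the top byte, initial value in
 * the middle bits, trip count in the low 12 bits):
 *   increment = 0x01, init = 0x000, max count = 0xFFF (4095)
 * which matches the init/increment/maximum described above. */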
896 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
897 }
898
899 void evergreen_init_compute_state_functions(struct r600_context *ctx)
900 {
901 ctx->b.b.create_compute_state = evergreen_create_compute_state;
902 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
903 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
904 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
905 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
906 ctx->b.b.set_global_binding = evergreen_set_global_binding;
907 ctx->b.b.launch_grid = evergreen_launch_grid;
908
909 }
910
911 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
912 const struct pipe_resource *templ)
913 {
914 struct r600_resource_global* result = NULL;
915 struct r600_screen* rscreen = NULL;
916 int size_in_dw = 0;
917
918 assert(templ->target == PIPE_BUFFER);
919 assert(templ->bind & PIPE_BIND_GLOBAL);
920 assert(templ->array_size == 1 || templ->array_size == 0);
921 assert(templ->depth0 == 1 || templ->depth0 == 0);
922 assert(templ->height0 == 1 || templ->height0 == 0);
923
924 result = (struct r600_resource_global*)
925 CALLOC(sizeof(struct r600_resource_global), 1);
926 rscreen = (struct r600_screen*)screen;
927
928 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
929 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
930 templ->array_size);
931
932 result->base.b.vtbl = &r600_global_buffer_vtbl;
933 result->base.b.b = *templ;
934 result->base.b.b.screen = screen;
935 pipe_reference_init(&result->base.b.b.reference, 1);
936
937 size_in_dw = (templ->width0+3) / 4;
938
939 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
940
941 if (result->chunk == NULL)
942 {
943 free(result);
944 return NULL;
945 }
946
947 return &result->base.b.b;
948 }
949
950 void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
951 struct pipe_resource *res)
952 {
953 struct r600_resource_global* buffer = NULL;
954 struct r600_screen* rscreen = NULL;
955
956 assert(res->target == PIPE_BUFFER);
957 assert(res->bind & PIPE_BIND_GLOBAL);
958
959 buffer = (struct r600_resource_global*)res;
960 rscreen = (struct r600_screen*)screen;
961
962 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
963
964 buffer->chunk = NULL;
965 free(res);
966 }
967
968 void *r600_compute_global_transfer_map(struct pipe_context *ctx_,
969 struct pipe_resource *resource,
970 unsigned level,
971 unsigned usage,
972 const struct pipe_box *box,
973 struct pipe_transfer **ptransfer)
974 {
975 struct r600_context *rctx = (struct r600_context*)ctx_;
976 struct compute_memory_pool *pool = rctx->screen->global_pool;
977 struct r600_resource_global* buffer =
978 (struct r600_resource_global*)resource;
979
980 struct compute_memory_item *item = buffer->chunk;
981 struct pipe_resource *dst = NULL;
982 unsigned offset = box->x;
983
984 if (is_item_in_pool(item)) {
985 compute_memory_demote_item(pool, item, ctx_);
986 }
987 else {
988 if (item->real_buffer == NULL) {
989 item->real_buffer =
990 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
991 }
992 }
993
994 dst = (struct pipe_resource*)item->real_buffer;
995
996 if (usage & PIPE_TRANSFER_READ)
997 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
998
999 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1000 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1001 "width = %u, height = %u, depth = %u)\n", level, usage,
1002 box->x, box->y, box->z, box->width, box->height,
1003 box->depth);
1004 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1005 "%u (box.x)\n", item->id, box->x);
1006
1007
1008 assert(resource->target == PIPE_BUFFER);
1009 assert(resource->bind & PIPE_BIND_GLOBAL);
1010 assert(box->x >= 0);
1011 assert(box->y == 0);
1012 assert(box->z == 0);
1013
1014 ///TODO: do it better, mapping is not possible if the pool is too big
1015 return pipe_buffer_map_range(ctx_, dst,
1016 offset, box->width, usage, ptransfer);
1017 }
1018
1019 void r600_compute_global_transfer_unmap(struct pipe_context *ctx_,
1020 struct pipe_transfer *transfer)
1021 {
1022 /* struct r600_resource_global are not real resources, they just map
1023 * to an offset within the compute memory pool. The function
1024 * r600_compute_global_transfer_map() maps the memory pool
1025 * resource rather than the struct r600_resource_global passed to
1026 * it as an argument and then initializes ptransfer->resource with
1027 * the memory pool resource (via pipe_buffer_map_range).
1028 * When transfer_unmap is called it uses the memory pool's
1029 * vtable which calls r600_buffer_transfer_unmap() rather than
1030 * this function.
1031 */
1032 assert (!"This function should not be called");
1033 }
1034
1035 void r600_compute_global_transfer_flush_region(struct pipe_context *ctx_,
1036 struct pipe_transfer *transfer,
1037 const struct pipe_box *box)
1038 {
1039 assert(0 && "TODO");
1040 }
1041
1042 void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
1043 struct pipe_resource *resource,
1044 unsigned level,
1045 unsigned usage,
1046 const struct pipe_box *box,
1047 const void *data,
1048 unsigned stride,
1049 unsigned layer_stride)
1050 {
1051 assert(0 && "TODO");
1052 }