r600g: remove r600_hw_context_priv.h, move the stuff to r600_pipe.h
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/u_double_list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "r600.h"
42 #include "evergreend.h"
43 #include "r600_resource.h"
44 #include "r600_shader.h"
45 #include "r600_pipe.h"
46 #include "r600_formats.h"
47 #include "evergreen_compute.h"
48 #include "evergreen_compute_internal.h"
49 #include "compute_memory_pool.h"
50 #ifdef HAVE_OPENCL
51 #include "llvm_wrapper.h"
52 #endif
53
54 /**
55 RAT0 is for global binding write
56 VTX1 is for global binding read
57
58 for writing images: RAT1...
59 for reading images: TEX2...
60 TEX2 and RAT1 are paired
61
62 TEX2... consumes the same fetch resources that VTX2... would consume
63
64 CONST0 and VTX0 are for parameters
65 CONST0 binds the smaller input parameter buffer and is used for constant
66 indexing; it is also constant cached
67 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
68 the constant cache can handle
69
70 RATs are limited to 12, so we can only bind at most 11 textures for writing,
71 because we reserve RAT0 for global bindings. With byte addressing enabled
72 we should reserve another one too => at most 10 image bindings for writing.
73
74 from Nvidia OpenCL:
75 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
76 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
77
78 so 10 for writing is enough. 176 is the max for reading according to the docs
79
80 writable images should be listed first (id < 10), so their id corresponds to RAT(id+1)
81 writable images will also consume TEX slots, and VTX slots too, because of linear indexing
82
83 */
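/* Slot assignments as used by the binding functions further down in this
 * file (evergreen_compute_upload_input, evergreen_set_global_binding,
 * evergreen_set_compute_resources, evergreen_set_cs_sampler_view):
 *
 * RAT0 = global memory pool (write), RAT(i+1) = writable surface i,
 * VTX0/CONST0 = kernel parameters, VTX1 = global memory pool (read),
 * VTX(i+2) = compute resource i, TEX(i+2) = sampler view i.
 */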
84
85 static void evergreen_cs_set_vertex_buffer(
86 struct r600_context * rctx,
87 unsigned vb_index,
88 unsigned offset,
89 struct pipe_resource * buffer)
90 {
91 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
92 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
93 vb->stride = 1;
94 vb->buffer_offset = offset;
95 vb->buffer = buffer;
96 vb->user_buffer = NULL;
97
98 /* The vertex instructions in the compute shaders use the texture cache,
99 * so we need to invalidate it. */
100 rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
101 state->enabled_mask |= 1 << vb_index;
102 state->dirty_mask |= 1 << vb_index;
103 state->atom.dirty = true;
104 }
105
106 static const struct u_resource_vtbl r600_global_buffer_vtbl =
107 {
108 u_default_resource_get_handle, /* get_handle */
109 r600_compute_global_buffer_destroy, /* resource_destroy */
110 r600_compute_global_transfer_map, /* transfer_map */
111 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
112 r600_compute_global_transfer_unmap, /* transfer_unmap */
113 r600_compute_global_transfer_inline_write /* transfer_inline_write */
114 };
115
116
117 void *evergreen_create_compute_state(
118 struct pipe_context *ctx_,
119 const struct pipe_compute_state *cso)
120 {
121 struct r600_context *ctx = (struct r600_context *)ctx_;
122 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
123
124 #ifdef HAVE_OPENCL
125 const struct pipe_llvm_program_header * header;
126 const unsigned char * code;
127 unsigned i;
128
129 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
130
131 header = cso->prog;
132 code = cso->prog + sizeof(struct pipe_llvm_program_header);
133 #endif
134
135 shader->ctx = (struct r600_context*)ctx;
136 shader->resources = (struct evergreen_compute_resource*)
137 CALLOC(sizeof(struct evergreen_compute_resource),
138 get_compute_resource_num());
139 shader->local_size = cso->req_local_mem; ///TODO: assert it
140 shader->private_size = cso->req_private_mem;
141 shader->input_size = cso->req_input_mem;
142
143 #ifdef HAVE_OPENCL
144 shader->num_kernels = llvm_get_num_kernels(code, header->num_bytes);
145 shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
146
147 for (i = 0; i < shader->num_kernels; i++) {
148 struct r600_kernel *kernel = &shader->kernels[i];
149 kernel->llvm_module = llvm_get_kernel_module(i, code,
150 header->num_bytes);
151 }
152 #endif
153 return shader;
154 }
155
156 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
157 {
158 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
159
160 free(shader->resources);
161 free(shader);
162 }
163
164 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
165 {
166 struct r600_context *ctx = (struct r600_context *)ctx_;
167
168 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
169
170 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
171 }
172
173 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
174 * kernel parameters, there are implicit parameters that need to be stored
175 * in the vertex buffer as well. Here is how these parameters are organized in
176 * the buffer:
177 *
178 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
179 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
180 * DWORDS 6-8: Number of work items within each work group in each dimension
181 * (x,y,z)
182 * DWORDS 9+ : Kernel parameters
183 */
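/* Illustrative example (made-up launch values): with grid_layout = {2, 2, 1}
 * and block_layout = {4, 4, 1} the buffer starts with
 * { 2, 2, 1,  8, 8, 1,  4, 4, 1,  <kernel parameters>... },
 * since the global work size in each dimension is
 * grid_layout[i] * block_layout[i], as computed below.
 */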
184 void evergreen_compute_upload_input(
185 struct pipe_context *ctx_,
186 const uint *block_layout,
187 const uint *grid_layout,
188 const void *input)
189 {
190 struct r600_context *ctx = (struct r600_context *)ctx_;
191 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
192 int i;
193 unsigned kernel_parameters_offset_bytes = 36;
194 uint32_t * num_work_groups_start;
195 uint32_t * global_size_start;
196 uint32_t * local_size_start;
197 uint32_t * kernel_parameters_start;
198
199 if (shader->input_size == 0) {
200 return;
201 }
202
203 if (!shader->kernel_param) {
204 unsigned buffer_size = shader->input_size;
205
206 /* Add space for the grid dimensions */
207 buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
208 shader->kernel_param = r600_compute_buffer_alloc_vram(
209 ctx->screen, buffer_size);
210 }
211
212 num_work_groups_start = r600_buffer_mmap_sync_with_rings(ctx, shader->kernel_param, PIPE_TRANSFER_WRITE);
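/* The mapping is a uint32_t pointer, so the offsets below advance in 32-bit
 * units: each of the three launch-info blocks described above is 3 dwords
 * (12 bytes) long. */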
213 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
214 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
215 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
216
217 /* Copy the number of work groups */
218 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
219
220 /* Copy the global size */
221 for (i = 0; i < 3; i++) {
222 global_size_start[i] = grid_layout[i] * block_layout[i];
223 }
224
225 /* Copy the local dimensions */
226 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
227
228 /* Copy the kernel inputs */
229 memcpy(kernel_parameters_start, input, shader->input_size);
230
231 for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
232 (shader->input_size / 4); i++) {
233 COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
234 ((unsigned*)num_work_groups_start)[i]);
235 }
236
237 ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);
238
239 ///ID=0 is reserved for the parameters
240 evergreen_cs_set_vertex_buffer(ctx, 0, 0,
241 (struct pipe_resource*)shader->kernel_param);
242 ///ID=0 is reserved for parameters
243 evergreen_set_const_cache(shader, 0, shader->kernel_param,
244 shader->input_size, 0);
245 }
246
247 static void evergreen_emit_direct_dispatch(
248 struct r600_context *rctx,
249 const uint *block_layout, const uint *grid_layout)
250 {
251 int i;
252 struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
253 unsigned num_waves;
254 unsigned num_pipes = rctx->screen->info.r600_max_pipes;
255 unsigned wave_divisor = (16 * num_pipes);
256 int group_size = 1;
257 int grid_size = 1;
258 /* XXX: Enable lds and get size from cs_shader_state */
259 unsigned lds_size = 0;
260
261 /* Calculate group_size/grid_size */
262 for (i = 0; i < 3; i++) {
263 group_size *= block_layout[i];
264 }
265
266 for (i = 0; i < 3; i++) {
267 grid_size *= grid_layout[i];
268 }
269
270 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
271 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
272 wave_divisor - 1) / wave_divisor;
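/* For example (illustrative numbers only): a 16x16x1 block is 256 threads;
 * with 8 pipes wave_divisor is 128, so num_waves = 2. */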
273
274 COMPUTE_DBG(rctx->screen, "Using %u pipes, there are %u wavefronts per thread block\n",
275 num_pipes, num_waves);
276
277 /* XXX: Partition the LDS between PS/CS. By default half (4096 dwords
278 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
279 * We may need to allocate the entire LDS space for Compute Shaders.
280 *
281 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
282 * CM: CM_R_0286FC_SPI_LDS_MGMT := S_0286FC_NUM_LS_LDS(lds_dwords)
283 */
284
285 r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
286
287 r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
288 r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
289 r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
290 r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
291
292 r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
293 group_size);
294
295 r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
296 r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
297 r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
298 r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
299
300 r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
301 lds_size | (num_waves << 14));
302
303 /* Dispatch packet */
304 r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
305 r600_write_value(cs, grid_layout[0]);
306 r600_write_value(cs, grid_layout[1]);
307 r600_write_value(cs, grid_layout[2]);
308 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
309 r600_write_value(cs, 1);
310 }
311
312 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
313 const uint *grid_layout)
314 {
315 struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
316 unsigned flush_flags = 0;
317 int i;
318 struct r600_resource *onebo = NULL;
319 struct evergreen_compute_resource *resources =
320 ctx->cs_shader_state.shader->resources;
321
322 /* make sure that the gfx ring is the only active one */
323 if (ctx->rings.dma.cs) {
324 ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
325 }
326
327 /* Initialize all the compute-related registers.
328 *
329 * See evergreen_init_atom_start_compute_cs() in this file for the list
330 * of registers initialized by the start_compute_cs_cmd atom.
331 */
332 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
333
334 ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
335 r600_flush_emit(ctx);
336
337 /* Emit colorbuffers. */
338 for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
339 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
340 unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
341 (struct r600_resource*)cb->base.texture,
342 RADEON_USAGE_READWRITE);
343
344 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
345 r600_write_value(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
346 r600_write_value(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
347 r600_write_value(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
348 r600_write_value(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
349 r600_write_value(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
350 r600_write_value(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
351 r600_write_value(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
352
353 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
354 r600_write_value(cs, reloc);
355
356 if (!ctx->keep_tiling_flags) {
357 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
358 r600_write_value(cs, reloc);
359 }
360
361 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
362 r600_write_value(cs, reloc);
363 }
364
365 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
366 r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
367 ctx->compute_cb_target_mask);
368
369
370 /* Emit vertex buffer state */
371 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
372 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
373
374 /* Emit compute shader state */
375 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
376
377 for (i = 0; i < get_compute_resource_num(); i++) {
378 if (resources[i].enabled) {
379 int j;
380 COMPUTE_DBG(ctx->screen, "resnum: %i, cdw: %i\n", i, cs->cdw);
381
382 for (j = 0; j < resources[i].cs_end; j++) {
383 if (resources[i].do_reloc[j]) {
384 assert(resources[i].bo);
385 evergreen_emit_ctx_reloc(ctx,
386 resources[i].bo,
387 resources[i].usage);
388 }
389
390 cs->buf[cs->cdw++] = resources[i].cs[j];
391 }
392
393 if (resources[i].bo) {
394 onebo = resources[i].bo;
395 evergreen_emit_ctx_reloc(ctx,
396 resources[i].bo,
397 resources[i].usage);
398
399 ///special case for textures
400 if (resources[i].do_reloc
401 [resources[i].cs_end] == 2) {
402 evergreen_emit_ctx_reloc(ctx,
403 resources[i].bo,
404 resources[i].usage);
405 }
406 }
407 }
408 }
409
410 /* Emit dispatch state and dispatch packet */
411 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
412
413 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
414 */
415 ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
416 r600_flush_emit(ctx);
417
418 #if 0
419 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
420 for (i = 0; i < cs->cdw; i++) {
421 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
422 }
423 #endif
424
425 flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
426 if (ctx->keep_tiling_flags) {
427 flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
428 }
429
430 ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags);
431
432 ctx->flags = 0;
433
434 COMPUTE_DBG(ctx->screen, "shader started\n");
435
436 ctx->ws->buffer_wait(onebo->buf, 0);
437
438 COMPUTE_DBG(ctx->screen, "...\n");
439 }
440
441
442 /**
443 * Emit function for r600_cs_shader_state atom
444 */
445 void evergreen_emit_cs_shader(
446 struct r600_context *rctx,
447 struct r600_atom *atom)
448 {
449 struct r600_cs_shader_state *state =
450 (struct r600_cs_shader_state*)atom;
451 struct r600_pipe_compute *shader = state->shader;
452 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
453 struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
454 uint64_t va;
455
456 va = r600_resource_va(&rctx->screen->screen, &kernel->code_bo->b.b);
457
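/* SQ_PGM_START_LS takes the shader's GPU address shifted right by 8, i.e.
 * the code buffer is assumed to be 256-byte aligned. */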
458 r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
459 r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
460 r600_write_value(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
461 S_0288D4_NUM_GPRS(kernel->bc.ngpr)
462 | S_0288D4_STACK_SIZE(kernel->bc.nstack));
463 r600_write_value(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
464
465 r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
466 r600_write_value(cs, r600_context_bo_reloc(rctx, &rctx->rings.gfx,
467 kernel->code_bo, RADEON_USAGE_READ));
468
469 rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
470 }
471
472 static void evergreen_launch_grid(
473 struct pipe_context *ctx_,
474 const uint *block_layout, const uint *grid_layout,
475 uint32_t pc, const void *input)
476 {
477 struct r600_context *ctx = (struct r600_context *)ctx_;
478
479 #ifdef HAVE_OPENCL
480 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
481
482 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
483 if (!shader->kernels[pc].code_bo) {
484 void *p;
485 struct r600_kernel *kernel = &shader->kernels[pc];
486 r600_compute_shader_create(ctx_, kernel->llvm_module, &kernel->bc);
487 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
488 kernel->bc.ndw * 4);
489 p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
490 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
491 ctx->ws->buffer_unmap(kernel->code_bo->cs_buf);
492 }
493 #endif
494
495 ctx->cs_shader_state.kernel_index = pc;
496 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
497 compute_emit_cs(ctx, block_layout, grid_layout);
498 }
499
500 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
501 unsigned start, unsigned count,
502 struct pipe_surface ** surfaces)
503 {
504 struct r600_context *ctx = (struct r600_context *)ctx_;
505 struct r600_surface **resources = (struct r600_surface **)surfaces;
506
507 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
508 start, count);
509
510 for (int i = 0; i < count; i++) {
511 /* The first two vertex buffers are reserved for parameters and
512 * global buffers. */
513 unsigned vtx_id = 2 + i;
514 if (resources[i]) {
515 struct r600_resource_global *buffer =
516 (struct r600_resource_global*)
517 resources[i]->base.texture;
518 if (resources[i]->base.writable) {
519 assert(i+1 < 12);
520
521 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
522 (struct r600_resource *)resources[i]->base.texture,
523 buffer->chunk->start_in_dw*4,
524 resources[i]->base.texture->width0);
525 }
526
527 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
528 buffer->chunk->start_in_dw * 4,
529 resources[i]->base.texture);
530 }
531 }
532 }
533
534 static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
535 unsigned start_slot, unsigned count,
536 struct pipe_sampler_view **views)
537 {
538 struct r600_context *ctx = (struct r600_context *)ctx_;
539 struct r600_pipe_sampler_view **resource =
540 (struct r600_pipe_sampler_view **)views;
541
542 for (int i = 0; i < count; i++) {
543 if (resource[i]) {
544 assert(i+1 < 12);
545 ///FETCH0 = VTX0 (param buffer),
546 ///FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
547 evergreen_set_tex_resource(ctx->cs_shader_state.shader, resource[i], i+2);
548 }
549 }
550 }
551
552 static void evergreen_bind_compute_sampler_states(
553 struct pipe_context *ctx_,
554 unsigned start_slot,
555 unsigned num_samplers,
556 void **samplers_)
557 {
558 struct r600_context *ctx = (struct r600_context *)ctx_;
559 struct compute_sampler_state ** samplers =
560 (struct compute_sampler_state **)samplers_;
561
562 for (int i = 0; i < num_samplers; i++) {
563 if (samplers[i]) {
564 evergreen_set_sampler_resource(
565 ctx->cs_shader_state.shader, samplers[i], i);
566 }
567 }
568 }
569
570 static void evergreen_set_global_binding(
571 struct pipe_context *ctx_, unsigned first, unsigned n,
572 struct pipe_resource **resources,
573 uint32_t **handles)
574 {
575 struct r600_context *ctx = (struct r600_context *)ctx_;
576 struct compute_memory_pool *pool = ctx->screen->global_pool;
577 struct r600_resource_global **buffers =
578 (struct r600_resource_global **)resources;
579
580 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
581 first, n);
582
583 if (!resources) {
584 /* XXX: Unset */
585 return;
586 }
587
588 compute_memory_finalize_pending(pool, ctx_);
589
590 for (int i = 0; i < n; i++)
591 {
592 assert(resources[i]->target == PIPE_BUFFER);
593 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
594
595 *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
596 }
597
598 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
599 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
600 (struct pipe_resource*)pool->bo);
601 }
602
603 /**
604 * This function initializes all the compute specific registers that need to
605 * be initialized for each compute command stream. Registers that are common
606 * to both compute and 3D will be initialized at the beginning of each compute
607 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
608 * packet requires that the shader type bit be set, we must initialize all
609 * context registers needed for compute in this function. The registers
610 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
611 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
612 * on the GPU family.
613 */
614 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
615 {
616 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
617 int num_threads;
618 int num_stack_entries;
619
620 /* since all required registers are initialized in the
621 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
622 */
623 r600_init_command_buffer(cb, 256);
624 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
625
626 /* This must be first. */
627 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
628 r600_store_value(cb, 0x80000000);
629 r600_store_value(cb, 0x80000000);
630
631 /* We're setting config registers here. */
632 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
633 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
634
635 switch (ctx->family) {
636 case CHIP_CEDAR:
637 default:
638 num_threads = 128;
639 num_stack_entries = 256;
640 break;
641 case CHIP_REDWOOD:
642 num_threads = 128;
643 num_stack_entries = 256;
644 break;
645 case CHIP_JUNIPER:
646 num_threads = 128;
647 num_stack_entries = 512;
648 break;
649 case CHIP_CYPRESS:
650 case CHIP_HEMLOCK:
651 num_threads = 128;
652 num_stack_entries = 512;
653 break;
654 case CHIP_PALM:
655 num_threads = 128;
656 num_stack_entries = 256;
657 break;
658 case CHIP_SUMO:
659 num_threads = 128;
660 num_stack_entries = 256;
661 break;
662 case CHIP_SUMO2:
663 num_threads = 128;
664 num_stack_entries = 512;
665 break;
666 case CHIP_BARTS:
667 num_threads = 128;
668 num_stack_entries = 512;
669 break;
670 case CHIP_TURKS:
671 num_threads = 128;
672 num_stack_entries = 256;
673 break;
674 case CHIP_CAICOS:
675 num_threads = 128;
676 num_stack_entries = 256;
677 break;
678 }
679
680 /* Config Registers */
681 if (ctx->chip_class < CAYMAN)
682 evergreen_init_common_regs(cb, ctx->chip_class, ctx->family,
683 ctx->screen->info.drm_minor);
684 else
685 cayman_init_common_regs(cb, ctx->chip_class, ctx->family,
686 ctx->screen->info.drm_minor);
687
688 /* The primitive type always needs to be POINTLIST for compute. */
689 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
690 V_008958_DI_PT_POINTLIST);
691
692 if (ctx->chip_class < CAYMAN) {
693
694 /* These registers control which simds can be used by each stage.
695 * The default for these registers is 0xffffffff, which means
696 * all simds are available for each stage. It's possible we may
697 * want to play around with these in the future, but for now
698 * the default value is fine.
699 *
700 * R_008E20_SQ_STATIC_THREAD_MGMT1
701 * R_008E24_SQ_STATIC_THREAD_MGMT2
702 * R_008E28_SQ_STATIC_THREAD_MGMT3
703 */
704
705 /* XXX: We may need to adjust the thread and stack resource
706 * values for 3D/compute interop */
707
708 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
709
710 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
711 * Set the number of threads used by the PS/VS/GS/ES stage to
712 * 0.
713 */
714 r600_store_value(cb, 0);
715
716 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
717 * Set the number of threads used by the CS (aka LS) stage to
718 * the maximum number of threads and set the number of threads
719 * for the HS stage to 0. */
720 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
721
722 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
723 * Set the Control Flow stack entries to 0 for PS/VS stages */
724 r600_store_value(cb, 0);
725
726 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
727 * Set the Control Flow stack entries to 0 for GS/ES stages */
728 r600_store_value(cb, 0);
729
730 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
731 * Set the Control Flow stack entries to 0 for the HS stage, and
732 * set it to the maximum value for the CS (aka LS) stage. */
733 r600_store_value(cb,
734 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
735 }
736
737 /* Context Registers */
738
739 if (ctx->chip_class < CAYMAN) {
740 /* workaround for hw issues with dyn gpr - must set all limits
741 * to 240 instead of 0, 0x1e == 240 / 8
742 */
743 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
744 S_028838_PS_GPRS(0x1e) |
745 S_028838_VS_GPRS(0x1e) |
746 S_028838_GS_GPRS(0x1e) |
747 S_028838_ES_GPRS(0x1e) |
748 S_028838_HS_GPRS(0x1e) |
749 S_028838_LS_GPRS(0x1e));
750 }
751
752 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
753 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
754 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
755
756 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
757
758 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
759 S_0286E8_TID_IN_GROUP_ENA
760 | S_0286E8_TGID_ENA
761 | S_0286E8_DISABLE_INDEX_PACK)
762 ;
763
764 /* The LOOP_CONST registers are an optimization for loops that allows
765 * you to store the initial counter, increment value, and maximum
766 * counter value in a register so that hardware can calculate the
767 * correct number of iterations for the loop, so that you don't need
768 * to have the loop counter in your shader code. We don't currently use
769 * this optimization, so we must keep track of the counter in the
770 * shader and use a break instruction to exit loops. However, the
771 * hardware will still use this register to determine when to exit a
772 * loop, so we need to initialize the counter to 0, set the increment
773 * value to 1 and the maximum counter value to 4095 (0xfff), which
774 * is the maximum value allowed. This gives us a maximum of 4096
775 * iterations for our loops, but hopefully our break instruction will
776 * execute some time before the 4096th iteration.
777 */
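/* Per the description above, 0x1000FFF packs the increment (1) into the top
 * byte, the initial counter value (0) into the middle bits, and the maximum
 * count (0xfff) into the low 12 bits. */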
778 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
779 }
780
781 void evergreen_init_compute_state_functions(struct r600_context *ctx)
782 {
783 ctx->context.create_compute_state = evergreen_create_compute_state;
784 ctx->context.delete_compute_state = evergreen_delete_compute_state;
785 ctx->context.bind_compute_state = evergreen_bind_compute_state;
786 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
787 ctx->context.set_compute_resources = evergreen_set_compute_resources;
788 ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
789 ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
790 ctx->context.set_global_binding = evergreen_set_global_binding;
791 ctx->context.launch_grid = evergreen_launch_grid;
792
793 /* We always use at least two vertex buffers for compute, one for
794 * parameters and one for global memory */
795 ctx->cs_vertex_buffer_state.enabled_mask =
796 ctx->cs_vertex_buffer_state.dirty_mask = 1 | 2;
797 }
798
799
800 struct pipe_resource *r600_compute_global_buffer_create(
801 struct pipe_screen *screen,
802 const struct pipe_resource *templ)
803 {
804 struct r600_resource_global* result = NULL;
805 struct r600_screen* rscreen = NULL;
806 int size_in_dw = 0;
807
808 assert(templ->target == PIPE_BUFFER);
809 assert(templ->bind & PIPE_BIND_GLOBAL);
810 assert(templ->array_size == 1 || templ->array_size == 0);
811 assert(templ->depth0 == 1 || templ->depth0 == 0);
812 assert(templ->height0 == 1 || templ->height0 == 0);
813
814 result = (struct r600_resource_global*)
815 CALLOC(sizeof(struct r600_resource_global), 1);
816 rscreen = (struct r600_screen*)screen;
817
818 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
819 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
820 templ->array_size);
821
822 result->base.b.vtbl = &r600_global_buffer_vtbl;
823 result->base.b.b.screen = screen;
824 result->base.b.b = *templ;
825 pipe_reference_init(&result->base.b.b.reference, 1);
826
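/* Round the requested size in bytes up to a whole number of dwords. */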
827 size_in_dw = (templ->width0+3) / 4;
828
829 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
830
831 if (result->chunk == NULL)
832 {
833 free(result);
834 return NULL;
835 }
836
837 return &result->base.b.b;
838 }
839
840 void r600_compute_global_buffer_destroy(
841 struct pipe_screen *screen,
842 struct pipe_resource *res)
843 {
844 struct r600_resource_global* buffer = NULL;
845 struct r600_screen* rscreen = NULL;
846
847 assert(res->target == PIPE_BUFFER);
848 assert(res->bind & PIPE_BIND_GLOBAL);
849
850 buffer = (struct r600_resource_global*)res;
851 rscreen = (struct r600_screen*)screen;
852
853 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
854
855 buffer->chunk = NULL;
856 free(res);
857 }
858
859 void *r600_compute_global_transfer_map(
860 struct pipe_context *ctx_,
861 struct pipe_resource *resource,
862 unsigned level,
863 unsigned usage,
864 const struct pipe_box *box,
865 struct pipe_transfer **ptransfer)
866 {
867 struct r600_context *rctx = (struct r600_context*)ctx_;
868 struct compute_memory_pool *pool = rctx->screen->global_pool;
869 struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
870 struct r600_resource_global* buffer =
871 (struct r600_resource_global*)resource;
872 uint32_t* map;
873
874 compute_memory_finalize_pending(pool, ctx_);
875
876 assert(resource->target == PIPE_BUFFER);
877
878 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
879 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
880 "width = %u, height = %u, depth = %u)\n", level, usage,
881 box->x, box->y, box->z, box->width, box->height,
882 box->depth);
883
884 transfer->resource = resource;
885 transfer->level = level;
886 transfer->usage = usage;
887 transfer->box = *box;
888 transfer->stride = 0;
889 transfer->layer_stride = 0;
890
891 assert(transfer->resource->target == PIPE_BUFFER);
892 assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
893 assert(transfer->box.x >= 0);
894 assert(transfer->box.y == 0);
895 assert(transfer->box.z == 0);
896
897 ///TODO: do it better, mapping is not possible if the pool is too big
898
899 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n");
900
901 if (!(map = r600_buffer_mmap_sync_with_rings(rctx, buffer->chunk->pool->bo, transfer->usage))) {
902 util_slab_free(&rctx->pool_transfers, transfer);
903 return NULL;
904 }
905
906 *ptransfer = transfer;
907
908 COMPUTE_DBG(rctx->screen, "Buffer: %p + %u (buffer offset in global memory) "
909 "+ %u (box.x)\n", map, buffer->chunk->start_in_dw, transfer->box.x);
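/* map is a uint32_t pointer, so adding start_in_dw advances in dwords;
 * the cast to char* then makes box.x a plain byte offset into the chunk. */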
910 return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
911 }
912
913 void r600_compute_global_transfer_unmap(
914 struct pipe_context *ctx_,
915 struct pipe_transfer* transfer)
916 {
917 struct r600_context *ctx = NULL;
918 struct r600_resource_global* buffer = NULL;
919
920 assert(transfer->resource->target == PIPE_BUFFER);
921 assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
922
923 ctx = (struct r600_context *)ctx_;
924 buffer = (struct r600_resource_global*)transfer->resource;
925
926 COMPUTE_DBG(ctx->screen, "* r600_compute_global_transfer_unmap()\n");
927
928 ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
929 util_slab_free(&ctx->pool_transfers, transfer);
930 }
931
932 void r600_compute_global_transfer_flush_region(
933 struct pipe_context *ctx_,
934 struct pipe_transfer *transfer,
935 const struct pipe_box *box)
936 {
937 assert(0 && "TODO");
938 }
939
940 void r600_compute_global_transfer_inline_write(
941 struct pipe_context *pipe,
942 struct pipe_resource *resource,
943 unsigned level,
944 unsigned usage,
945 const struct pipe_box *box,
946 const void *data,
947 unsigned stride,
948 unsigned layer_stride)
949 {
950 assert(0 && "TODO");
951 }