radeon/llvm: fix compiling when llvm is active, but opencl isn't
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "r600.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "r600_hw_context_priv.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "llvm_wrapper.h"
#endif

/**
RAT0 is for global binding writes
VTX1 is for global binding reads

for writing images: RAT1...
for reading images: TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer, and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => at most 10 image bindings for writing.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8

so 10 for writing is enough. 176 is the max for reading according to the docs.

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too, because of linear indexing

*/
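
/* Illustrative sketch only (hypothetical helpers, not used by the driver):
 * one possible way to express the slot mapping described above, given that
 * RAT0 is reserved for the global pool and FETCH0/FETCH1 for the parameter
 * and global buffers.
 */
#if 0
static inline unsigned writable_image_rat_id(unsigned image_id)
{
	/* writable image i is bound as RAT(i + 1) */
	return image_id + 1;
}

static inline unsigned image_tex_fetch_id(unsigned image_id)
{
	/* image reads start at TEX2, after the VTX0/VTX1 fetch slots */
	return image_id + 2;
}
#endif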

static void evergreen_cs_set_vertex_buffer(
	struct r600_context * rctx,
	unsigned vb_index,
	unsigned offset,
	struct pipe_resource * buffer)
{
	struct pipe_vertex_buffer *vb = &rctx->cs_vertex_buffer[vb_index];
	struct r600_vertexbuf_state * state = &rctx->cs_vertex_buffer_state;
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	r600_inval_vertex_cache(rctx);
	state->dirty_mask |= 1 << vb_index;
	r600_atom_dirty(rctx, &state->atom);
}

const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_get_transfer, /* get_transfer */
	r600_compute_global_transfer_destroy, /* transfer_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};

void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;

	COMPUTE_DBG("*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	if (!ctx->screen->screen.get_param(&ctx->screen->screen,
							PIPE_CAP_COMPUTE)) {
		fprintf(stderr, "Compute is not supported\n");
		return NULL;
	}
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

	shader->ctx = (struct r600_context*)ctx;
	shader->resources = (struct evergreen_compute_resource*)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->mod = llvm_parse_bitcode(code, header->num_bytes);

	r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
#endif
	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG("*** evergreen_bind_compute_state\n");

	ctx->cs_shader = (struct r600_pipe_compute *)state;

	if (!ctx->cs_shader->shader_code_bo) {

		ctx->cs_shader->shader_code_bo =
			r600_compute_buffer_alloc_vram(ctx->screen,
					ctx->cs_shader->bc.ndw * 4);

		void *p = ctx->ws->buffer_map(
				ctx->cs_shader->shader_code_bo->cs_buf,
				ctx->cs, PIPE_TRANSFER_WRITE);

		memcpy(p, ctx->cs_shader->bc.bytecode, ctx->cs_shader->bc.ndw * 4);

		ctx->ws->buffer_unmap(ctx->cs_shader->shader_code_bo->cs_buf);

	}

	struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
						COMPUTE_RESOURCE_SHADER, 0);

	if (ctx->chip_class < CAYMAN) {
		evergreen_reg_set(res, R_008C0C_SQ_GPR_RESOURCE_MGMT_3,
			S_008C0C_NUM_LS_GPRS(ctx->cs_shader->bc.ngpr));
	}

	///maybe we can use it later
	evergreen_reg_set(res, R_0286C8_SPI_THREAD_GROUPING, 0);
	///maybe we can use it later
	evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);

	evergreen_reg_set(res, R_0288D4_SQ_PGM_RESOURCES_LS,
		S_0288D4_NUM_GPRS(ctx->cs_shader->bc.ngpr)
		| S_0288D4_STACK_SIZE(ctx->cs_shader->bc.nstack));
	evergreen_reg_set(res, R_0288D8_SQ_PGM_RESOURCES_LS_2, 0);

	evergreen_reg_set(res, R_0288D0_SQ_PGM_START_LS, 0);
	res->bo = ctx->cs_shader->shader_code_bo;
	res->usage = RADEON_USAGE_READ;
	res->coher_bo_size = ctx->cs_shader->bc.ndw*4;

	r600_inval_shader_cache(ctx);
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer (see also the illustrative struct after
 * evergreen_compute_upload_input()):
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	int i;
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (ctx->cs_shader->input_size == 0) {
		return;
	}

	if (!ctx->cs_shader->kernel_param) {
		unsigned buffer_size = ctx->cs_shader->input_size;

		/* Add space for the grid dimensions */
		buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
		ctx->cs_shader->kernel_param =
				r600_compute_buffer_alloc_vram(ctx->screen,
						buffer_size);
	}

	num_work_groups_start = ctx->ws->buffer_map(
			ctx->cs_shader->kernel_param->cs_buf,
			ctx->cs, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, ctx->cs_shader->input_size);

	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
			(ctx->cs_shader->input_size / 4); i++) {
		COMPUTE_DBG("input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(ctx->cs_shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_cs_set_vertex_buffer(ctx, 0, 0,
			(struct pipe_resource*)ctx->cs_shader->kernel_param);
	///ID=0 is reserved for parameters
	evergreen_set_const_cache(ctx->cs_shader, 0,
		ctx->cs_shader->kernel_param, ctx->cs_shader->input_size, 0);
}
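
/* For illustration only: a hypothetical struct (not used by the driver)
 * describing the implicit parameter block that evergreen_compute_upload_input()
 * writes in front of the kernel arguments, following the layout documented
 * above the function.
 */
#if 0
struct compute_implicit_params {
	uint32_t num_work_groups[3];	/* DWORDS 0-2: grid size in work groups */
	uint32_t global_size[3];	/* DWORDS 3-5: grid * block, per dimension */
	uint32_t local_size[3];		/* DWORDS 6-8: work group size */
	/* DWORDS 9+: the kernel parameters follow, copied verbatim */
};
#endif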

void evergreen_direct_dispatch(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout)
{
	/* This struct r600_context* must be called rctx, because the
	 * r600_pipe_state_add_reg macro assumes there is a local variable
	 * of type struct r600_context* called rctx.
	 */
	struct r600_context *rctx = (struct r600_context *)ctx_;

	int i;

	struct evergreen_compute_resource* res = get_empty_res(rctx->cs_shader,
		COMPUTE_RESOURCE_DISPATCH, 0);

	/* Set CB_TARGET_MASK */
	evergreen_reg_set(res, R_028238_CB_TARGET_MASK, rctx->compute_cb_target_mask);

	evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST);

	evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0);
	evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0);
	evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0);

	evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]);
	evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]);
	evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]);

	int group_size = 1;

	int grid_size = 1;

	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size);
	evergreen_reg_set(res, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size);

	evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	evergreen_emit_raw_value(res, grid_layout[0]);
	evergreen_emit_raw_value(res, grid_layout[1]);
	evergreen_emit_raw_value(res, grid_layout[2]);
	///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN
	evergreen_emit_raw_value(res, 1);
}

static void compute_emit_cs(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	int i;

	struct r600_resource *onebo = NULL;
	struct r600_pipe_state *cb_state;

	/* Initialize all the registers common to both 3D and compute.  Some
	 * 3D-only registers will be initialized by this atom as well, but
	 * this is OK for now.
	 *
	 * See evergreen_init_atom_start_cs() or cayman_init_atom_start_cs() in
	 * evergreen_state.c for the list of registers that are initialized by
	 * the start_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);

	/* Initialize all the compute specific registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom);

	/* Emit cb_state */
	cb_state = ctx->states[R600_PIPE_STATE_FRAMEBUFFER];
	r600_context_pipe_state_emit(ctx, cb_state, RADEON_CP_PACKET3_COMPUTE_MODE);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * ctx->nr_cs_vertex_buffers;
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (ctx->cs_shader->resources[i].enabled) {
			int j;
			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < ctx->cs_shader->resources[i].cs_end; j++) {
				if (ctx->cs_shader->resources[i].do_reloc[j]) {
					assert(ctx->cs_shader->resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						ctx->cs_shader->resources[i].bo,
						ctx->cs_shader->resources[i].usage);
				}

				cs->buf[cs->cdw++] = ctx->cs_shader->resources[i].cs[j];
			}

			if (ctx->cs_shader->resources[i].bo) {
				onebo = ctx->cs_shader->resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					ctx->cs_shader->resources[i].bo,
					ctx->cs_shader->resources[i].usage);

				///special case for textures
				if (ctx->cs_shader->resources[i].do_reloc
					[ctx->cs_shader->resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						ctx->cs_shader->resources[i].bo,
						ctx->cs_shader->resources[i].usage);
				}
			}
		}
	}

	/* r600_flush_framebuffer() updates the cb_flush_flags and then
	 * calls r600_emit_atom() on the ctx->surface_sync_cmd.atom, which emits
	 * a SURFACE_SYNC packet via r600_emit_surface_sync().
	 *
	 * XXX r600_emit_surface_sync() hardcodes the CP_COHER_SIZE to
	 * 0xffffffff, so we will need to add a field to struct
	 * r600_surface_sync_cmd if we want to manually set this value.
	 */
	r600_flush_framebuffer(ctx, true /* Flush now */);

#if 0
	COMPUTE_DBG("cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	COMPUTE_DBG("shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG("...\n");

	ctx->streamout_start = TRUE;
	ctx->streamout_append_bitmask = ~0;
}

static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	COMPUTE_DBG("PC: %i\n", pc);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	unsigned num_waves;
	unsigned num_pipes = ctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;
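	/* For example (illustrative numbers only): a 16x16x1 work group on a
	 * 4-pipe part is 256 work items, giving 256 / (16 * 4) = 4 wavefronts. */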

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
							num_pipes, num_waves);

	evergreen_set_lds(ctx->cs_shader, 0, 0, num_waves);
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
	compute_emit_cs(ctx);
}

static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG("*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (int i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
			ctx->nr_cs_vertex_buffers = vtx_id + 1;
		}
	}
}

static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (int i = 0; i < count; i++) {
		if (resource[i]) {
			assert(i+1 < 12);
			///FETCH0 = VTX0 (param buffer),
			///FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
			evergreen_set_tex_resource(ctx->cs_shader, resource[i], i+2);
		}
	}
}

static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state ** samplers =
		(struct compute_sampler_state **)samplers_;

	for (int i = 0; i < num_samplers; i++) {
		if (samplers[i]) {
			evergreen_set_sampler_resource(ctx->cs_shader, samplers[i], i);
		}
	}
}

static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	COMPUTE_DBG("*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	compute_memory_finalize_pending(pool, ctx_);

	for (int i = 0; i < n; i++)
	{
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* We aren't passing the EMIT_EARLY flag as the third argument
	 * because we will be emitting this atom manually in order to
	 * ensure it gets emitted after the start_cs_cmd atom.
	 */
	r600_init_command_buffer(cb, 256, 0);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	switch (ctx->family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (ctx->chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Context Registers */

	if (ctx->chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
						S_0286E8_TID_IN_GROUP_ENA
						| S_0286E8_TGID_ENA
						| S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
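	/* Assuming the usual SQ_LOOP_CONST packing (COUNT in bits [11:0],
	 * INIT in bits [23:12], INC in bits [31:24]), the value 0x1000FFF
	 * below would decode to COUNT = 0xfff, INIT = 0 and INC = 1, which
	 * matches the description above. */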
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->context.create_compute_state = evergreen_create_compute_state;
	ctx->context.delete_compute_state = evergreen_delete_compute_state;
	ctx->context.bind_compute_state = evergreen_bind_compute_state;
//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->context.set_compute_resources = evergreen_set_compute_resources;
	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
	ctx->context.set_global_binding = evergreen_set_global_binding;
	ctx->context.launch_grid = evergreen_launch_grid;

	/* We always use at least two vertex buffers for compute, one for
	 * parameters and one for global memory */
	ctx->nr_cs_vertex_buffers = 2;
}

struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	struct r600_resource_global* result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG("*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG("width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	/* Copy the template first so that the screen pointer set below is not
	 * overwritten by the copy. */
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	int size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	struct r600_resource_global* buffer = (struct r600_resource_global*)res;
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void* r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	uint32_t* map;
	///TODO: do it better, mapping is not possible if the pool is too big

	if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
				ctx->cs, transfer->usage))) {
		return NULL;
	}

	COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
}

struct pipe_transfer * r600_compute_global_get_transfer(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;
	transfer->data = NULL;

	/* Note strides are zero, this is ok for buffers, but not for
	 * textures 2d & higher at least.
	 */
	return transfer;
}

void r600_compute_global_transfer_destroy(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	util_slab_free(&rctx->pool_transfers, transfer);
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}