src/gallium/drivers/r600/evergreen_compute.c
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "r600.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "r600_hw_context_priv.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "llvm_wrapper.h"
#endif

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
CONST0 binds the smaller input parameter buffer, supports constant indexing,
and is constant cached
VTX0 is for indirect/non-constant indexing, or if the input is bigger than
the constant cache can handle

RATs are limited to 12, so we can bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled
we should reserve another one too => at most 10 image bindings for writing.

from Nvidia OpenCL:
CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first < 10, so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too because of linear indexing

*/
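
/* A sketch of the resulting layout for a hypothetical kernel with two
 * writable images and one read-only image, following the rules above:
 *
 *   RAT0         global buffer pool (write)
 *   RAT1, RAT2   writable images 0 and 1 (image id -> RAT(id+1))
 *   CONST0/VTX0  kernel parameters
 *   VTX1         global buffer pool (read)
 *   TEX2, TEX3   writable images 0 and 1
 *   TEX4         read-only image
 */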

const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle,            /* get_handle */
	r600_compute_global_buffer_destroy,       /* resource_destroy */
	r600_compute_global_get_transfer,         /* get_transfer */
	r600_compute_global_transfer_destroy,     /* transfer_destroy */
	r600_compute_global_transfer_map,         /* transfer_map */
	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
	r600_compute_global_transfer_unmap,       /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};


void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;

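	/* cso->prog is a pipe_llvm_program_header immediately followed by
	 * the LLVM bitcode it describes. */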
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	if (!ctx->screen->screen.get_param(&ctx->screen->screen,
							PIPE_CAP_COMPUTE)) {
		fprintf(stderr, "Compute is not supported\n");
		return NULL;
	}
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

	shader->ctx = (struct r600_context*)ctx;
	shader->resources = (struct evergreen_compute_resource*)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->mod = llvm_parse_bitcode(code, header->num_bytes);

	r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
#endif
	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	ctx->cs_shader = (struct r600_pipe_compute *)state;

	if (!ctx->cs_shader->shader_code_bo) {
		ctx->cs_shader->shader_code_bo =
			r600_compute_buffer_alloc_vram(ctx->screen,
					ctx->cs_shader->bc.ndw * 4);

		void *p = ctx->ws->buffer_map(
				ctx->cs_shader->shader_code_bo->cs_buf,
				ctx->cs, PIPE_TRANSFER_WRITE);

		memcpy(p, ctx->cs_shader->bc.bytecode, ctx->cs_shader->bc.ndw * 4);

		ctx->ws->buffer_unmap(ctx->cs_shader->shader_code_bo->cs_buf);
	}

	evergreen_compute_init_config(ctx);

	struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
						COMPUTE_RESOURCE_SHADER, 0);

	if (ctx->chip_class < CAYMAN) {
		evergreen_reg_set(res, R_008C0C_SQ_GPR_RESOURCE_MGMT_3,
			S_008C0C_NUM_LS_GPRS(ctx->cs_shader->bc.ngpr));
	}

	///maybe we can use it later
	evergreen_reg_set(res, R_0286C8_SPI_THREAD_GROUPING, 0);
	///maybe we can use it later
	evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);

	evergreen_reg_set(res, R_0288D4_SQ_PGM_RESOURCES_LS,
		S_0288D4_NUM_GPRS(ctx->cs_shader->bc.ngpr)
		| S_0288D4_STACK_SIZE(ctx->cs_shader->bc.nstack));
	evergreen_reg_set(res, R_0288D8_SQ_PGM_RESOURCES_LS_2, 0);

	evergreen_reg_set(res, R_0288D0_SQ_PGM_START_LS, 0);
	res->bo = ctx->cs_shader->shader_code_bo;
	res->usage = RADEON_USAGE_READ;
	res->coher_bo_size = ctx->cs_shader->bc.ndw * 4;

	r600_inval_shader_cache(ctx);

	/* We can't always determine the
	 * number of iterations in a loop before it's executed,
	 * so we just need to set up the loop counter to give us the maximum
	 * number of iterations possible. Currently, loops in shader code
	 * ignore the loop counter and use a break instruction to exit the
	 * loop at the correct time.
	 */
	evergreen_set_loop_const(ctx->cs_shader,
		0,     /* index */
		0xFFF, /* Maximum value of the loop counter (i.e. when the loop
			* counter reaches this value, the program will break
			* out of the loop). */
		0x0,   /* Starting value of the loop counter. */
		0x1);  /* Amount to increment the loop counter each iteration. */
}

/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
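 *
 * For example, a launch with grid_layout = (2, 3, 4) and
 * block_layout = (16, 16, 1) stores:
 *   DWORDS 0-2: 2, 3, 4
 *   DWORDS 3-5: 32, 48, 4   (grid * block in each dimension)
 *   DWORDS 6-8: 16, 16, 1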
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	int i;
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (ctx->cs_shader->input_size == 0) {
		return;
	}

	if (!ctx->cs_shader->kernel_param) {
		unsigned buffer_size = ctx->cs_shader->input_size;

		/* Add space for the grid dimensions; the offset is
		 * already in bytes. */
		buffer_size += kernel_parameters_offset_bytes;
		ctx->cs_shader->kernel_param =
				r600_compute_buffer_alloc_vram(ctx->screen,
						buffer_size);
	}

	num_work_groups_start = ctx->ws->buffer_map(
			ctx->cs_shader->kernel_param->cs_buf,
			ctx->cs, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, ctx->cs_shader->input_size);

	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
			(ctx->cs_shader->input_size / 4); i++) {
		COMPUTE_DBG("input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(ctx->cs_shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_set_vtx_resource(ctx->cs_shader,
		ctx->cs_shader->kernel_param, 0, 0, 0);
	///ID=0 is reserved for the parameters
	evergreen_set_const_cache(ctx->cs_shader, 0,
		ctx->cs_shader->kernel_param, ctx->cs_shader->input_size, 0);
}

void evergreen_direct_dispatch(
	struct pipe_context *ctx_,
	const uint *block_layout, const uint *grid_layout)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	int i;

	struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
					COMPUTE_RESOURCE_DISPATCH, 0);

	evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST);

	evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0);
	evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0);
	evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0);

	evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]);
	evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]);
	evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]);

	int group_size = 1;
	int grid_size = 1;

	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size);
	evergreen_reg_set(res, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size);

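	/* DISPATCH_DIRECT packet: three dwords carrying the grid size in
	 * work groups, followed by the dispatch initiator. */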
	evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	evergreen_emit_raw_value(res, grid_layout[0]);
	evergreen_emit_raw_value(res, grid_layout[1]);
	evergreen_emit_raw_value(res, grid_layout[2]);
	///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN
	evergreen_emit_raw_value(res, 1);
}

static void compute_emit_cs(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	int i;

	r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);

	struct r600_resource *onebo = NULL;

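	/* Emit the saved packet words for every enabled compute resource,
	 * adding a relocation wherever a packet references a buffer. */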
	for (i = 0; i < get_compute_resource_num(); i++) {
		if (ctx->cs_shader->resources[i].enabled) {
			int j;
			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < ctx->cs_shader->resources[i].cs_end; j++) {
				if (ctx->cs_shader->resources[i].do_reloc[j]) {
					assert(ctx->cs_shader->resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						ctx->cs_shader->resources[i].bo,
						ctx->cs_shader->resources[i].usage);
				}

				cs->buf[cs->cdw++] = ctx->cs_shader->resources[i].cs[j];
			}

			if (ctx->cs_shader->resources[i].bo) {
				onebo = ctx->cs_shader->resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					ctx->cs_shader->resources[i].bo,
					ctx->cs_shader->resources[i].usage);

				///special case for textures
				if (ctx->cs_shader->resources[i].do_reloc
					[ctx->cs_shader->resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						ctx->cs_shader->resources[i].bo,
						ctx->cs_shader->resources[i].usage);
				}
			}
		}
	}

	/* r600_flush_framebuffer() updates the cb_flush_flags and then
	 * calls r600_emit_atom() on the ctx->surface_sync_cmd.atom, which emits
	 * a SURFACE_SYNC packet via r600_emit_surface_sync().
	 *
	 * XXX r600_emit_surface_sync() hardcodes the CP_COHER_SIZE to
	 * 0xffffffff, so we will need to add a field to struct
	 * r600_surface_sync_cmd if we want to manually set this value.
	 */
	r600_flush_framebuffer(ctx, true /* Flush now */);

#if 0
	COMPUTE_DBG("cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	COMPUTE_DBG("shader started\n");

	/* onebo is NULL if no enabled resource carried a buffer;
	 * there is nothing to wait on in that case. */
	if (onebo) {
		ctx->ws->buffer_wait(onebo->buf, 0);
	}

	COMPUTE_DBG("...\n");

	r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);

	ctx->streamout_start = TRUE;
	ctx->streamout_append_bitmask = ~0;
}

static void evergreen_launch_grid(
	struct pipe_context *ctx_,
	const uint *block_layout, const uint *grid_layout,
	uint32_t pc, const void *input)
{
	COMPUTE_DBG("PC: %i\n", pc);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	unsigned num_waves;
	unsigned num_pipes = ctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;
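	/* For example, a 16x16x1 block on an 8-pipe chip needs
	 * ceil(256 / 128) = 2 wavefronts. */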

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
			num_pipes, num_waves);

	evergreen_set_lds(ctx->cs_shader, 0, 0, num_waves);
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
	compute_emit_cs(ctx);
}

static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;
	for (unsigned i = 0; i < count; i++) {
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_set_vtx_resource(ctx->cs_shader,
				(struct r600_resource *)resources[i]->base.texture, i+2,
				buffer->chunk->start_in_dw*4, resources[i]->base.writable);
		}
	}
}

static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (unsigned i = 0; i < count; i++) {
		if (resource[i]) {
			assert(i+1 < 12);
			///FETCH0 = VTX0 (param buffer),
			///FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
			evergreen_set_tex_resource(ctx->cs_shader, resource[i], i+2);
		}
	}
}

static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state ** samplers =
		(struct compute_sampler_state **)samplers_;

	for (unsigned i = 0; i < num_samplers; i++) {
		if (samplers[i]) {
			evergreen_set_sampler_resource(ctx->cs_shader, samplers[i], i);
		}
	}
}

static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	compute_memory_finalize_pending(pool, ctx_);

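	/* Report each buffer's byte offset within the global memory pool
	 * back to the state tracker through handles. */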
	for (unsigned i = 0; i < n; i++)
	{
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_set_vtx_resource(ctx->cs_shader, pool->bo, 1, 0, 1);
}


void evergreen_compute_init_config(struct r600_context *ctx)
{
	struct evergreen_compute_resource* res =
		get_empty_res(ctx->cs_shader, COMPUTE_RESOURCE_CONFIG, 0);

	int num_threads;
	int num_stack_entries;
	int num_temp_gprs;

	enum radeon_family family;
	unsigned tmp;

	family = ctx->family;

	switch (family) {
	case CHIP_CEDAR:
	default:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_temp_gprs = 4;
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	tmp = 0x00000000;
	switch (family) {
	case CHIP_CEDAR:
	case CHIP_PALM:
	case CHIP_SUMO:
	case CHIP_SUMO2:
	case CHIP_CAICOS:
		break;
	default:
		tmp |= S_008C00_VC_ENABLE(1);
		break;
	}
	tmp |= S_008C00_EXPORT_SRC_C(1);
	tmp |= S_008C00_CS_PRIO(0);
	tmp |= S_008C00_LS_PRIO(0);
	tmp |= S_008C00_HS_PRIO(0);
	tmp |= S_008C00_PS_PRIO(0);
	tmp |= S_008C00_VS_PRIO(0);
	tmp |= S_008C00_GS_PRIO(0);
	tmp |= S_008C00_ES_PRIO(0);

	evergreen_reg_set(res, R_008C00_SQ_CONFIG, tmp);

	evergreen_reg_set(res, R_008C04_SQ_GPR_RESOURCE_MGMT_1,
		S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs));
	if (ctx->chip_class < CAYMAN) {
		evergreen_reg_set(res, R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0);
	}
	evergreen_reg_set(res, R_008C10_SQ_GLOBAL_GPR_RESOURCE_MGMT_1, 0);
	evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);
	evergreen_reg_set(res, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));

	/* workaround for hw issues with dyn gpr - must set all limits to 240
	 * instead of 0, 0x1e == 240/8 */
	if (ctx->chip_class < CAYMAN) {
		evergreen_reg_set(res, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
			S_028838_PS_GPRS(0x1e) |
			S_028838_VS_GPRS(0x1e) |
			S_028838_GS_GPRS(0x1e) |
			S_028838_ES_GPRS(0x1e) |
			S_028838_HS_GPRS(0x1e) |
			S_028838_LS_GPRS(0x1e));
	} else {
		evergreen_reg_set(res, 0x286f8,
			S_028838_PS_GPRS(0x1e) |
			S_028838_VS_GPRS(0x1e) |
			S_028838_GS_GPRS(0x1e) |
			S_028838_ES_GPRS(0x1e) |
			S_028838_HS_GPRS(0x1e) |
			S_028838_LS_GPRS(0x1e));
	}

	if (ctx->chip_class < CAYMAN) {
		evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, 0xFFFFFFFF);
		evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, 0xFFFFFFFF);
		evergreen_reg_set(res, R_008E28_SQ_STATIC_THREAD_MGMT3, 0xFFFFFFFF);
		evergreen_reg_set(res, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 0);
		tmp = S_008C1C_NUM_LS_THREADS(num_threads);
		evergreen_reg_set(res, R_008C1C_SQ_THREAD_RESOURCE_MGMT_2, tmp);
		evergreen_reg_set(res, R_008C20_SQ_STACK_RESOURCE_MGMT_1, 0);
		evergreen_reg_set(res, R_008C24_SQ_STACK_RESOURCE_MGMT_2, 0);
		tmp = S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries);
		evergreen_reg_set(res, R_008C28_SQ_STACK_RESOURCE_MGMT_3, tmp);
	}
	evergreen_reg_set(res, R_0286CC_SPI_PS_IN_CONTROL_0, S_0286CC_LINEAR_GRADIENT_ENA(1));
	evergreen_reg_set(res, R_0286D0_SPI_PS_IN_CONTROL_1, 0);
	evergreen_reg_set(res, R_0286E4_SPI_PS_IN_CONTROL_2, 0);
	evergreen_reg_set(res, R_0286D8_SPI_INPUT_Z, 0);
	evergreen_reg_set(res, R_0286E0_SPI_BARYC_CNTL, 1 << 20);
	tmp = S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | S_0286E8_DISABLE_INDEX_PACK;
	evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, tmp);
	tmp = S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1);
	evergreen_reg_set(res, R_028A40_VGT_GS_MODE, tmp);
	evergreen_reg_set(res, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
	evergreen_reg_set(res, R_028800_DB_DEPTH_CONTROL, 0);
	evergreen_reg_set(res, R_02880C_DB_SHADER_CONTROL, 0);
	evergreen_reg_set(res, R_028000_DB_RENDER_CONTROL, S_028000_COLOR_DISABLE(1));
	evergreen_reg_set(res, R_02800C_DB_RENDER_OVERRIDE, 0);
}

void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->context.create_compute_state = evergreen_create_compute_state;
	ctx->context.delete_compute_state = evergreen_delete_compute_state;
	ctx->context.bind_compute_state = evergreen_bind_compute_state;
//	ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->context.set_compute_resources = evergreen_set_compute_resources;
	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
	ctx->context.set_global_binding = evergreen_set_global_binding;
	ctx->context.launch_grid = evergreen_launch_grid;
}


struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	struct r600_resource_global* result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	/* Copy the template first; the struct assignment would otherwise
	 * clobber the screen pointer. */
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

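	/* The compute memory pool tracks sizes in dwords; round up. */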
	int size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	struct r600_resource_global* buffer = (struct r600_resource_global*)res;
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void* r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	uint32_t* map;
	///TODO: do it better, mapping is not possible if the pool is too big

	if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
			ctx->cs, transfer->usage))) {
		return NULL;
	}

	COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
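	/* map is a dword pointer: advance start_in_dw dwords to reach this
	 * buffer's chunk, then box.x bytes within the buffer. */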
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
}

struct pipe_transfer * r600_compute_global_get_transfer(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);
	struct pipe_transfer *transfer = util_slab_alloc(&ctx->pool_transfers);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;
	transfer->data = NULL;

	/* Note strides are zero, this is ok for buffers, but not for
	 * textures 2d & higher at least.
	 */
	return transfer;
}

void r600_compute_global_transfer_destroy(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	util_slab_free(&rctx->pool_transfers, transfer);
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}