[mesa.git] src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "pipebuffer/pb_buffer.h"
46 #include "evergreend.h"
47 #include "r600_shader.h"
48 #include "r600_pipe.h"
49 #include "r600_formats.h"
50 #include "evergreen_compute.h"
51 #include "evergreen_compute_internal.h"
52 #include "compute_memory_pool.h"
53 #include "sb/sb_public.h"
54 #include <inttypes.h>
55
56 /**
57 RAT0 is for global binding write
58 VTX1 is for global binding read
59
60 for writing images: RAT1...
61 for reading images: TEX2...
62 TEX2 and RAT1 are paired
63
64 TEX2... consumes the same fetch resources that VTX2... would consume
65
66 CONST0 and VTX0 are for parameters
67 CONST0 binds the smaller input parameter buffer, is used for constant indexing,
68 and is also constant cached
69 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
70 the constant cache can handle
71
72 RATs are limited to 12, so we can bind at most 11 textures for writing,
73 because we reserve RAT0 for global bindings. With byte addressing enabled,
74 we should reserve another one too => 10 image bindings for writing max.
75
76 from NVIDIA OpenCL:
77 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
78 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
79
80 so 10 for writing is enough; 176 is the max for reading according to the docs
81
82 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
83 writable images will consume TEX slots, and VTX slots too, because of linear indexing
84
85 */
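/* For illustration only: a rough map of the fixed binding points described
 * above, using hypothetical names that are not used elsewhere in the driver.
 */
#if 0
enum evergreen_compute_fixed_bindings {
	/* RATs (writing) */
	RAT_GLOBAL_WRITE   = 0,  /* global memory pool, see evergreen_set_global_binding() */
	RAT_FIRST_IMAGE    = 1,  /* writable image "id" is bound as RAT(id + 1) */

	/* VTX fetch resources (reading) */
	VTX_PARAMS_A       = 0,  /* kernel parameters (reserved) */
	VTX_GLOBAL_READ    = 1,  /* global memory pool */
	VTX_SHADER_RODATA  = 2,  /* constants LLVM places in the code buffer */
	VTX_PARAMS_B       = 3,  /* kernel parameters, see evergreen_compute_upload_input() */
	VTX_FIRST_RESOURCE = 4,  /* compute resources, see evergreen_set_compute_resources() */
};
#endif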
86
87 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource *buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create((struct pipe_screen*) screen,
94 0, PIPE_USAGE_IMMUTABLE, size);
95
96 return (struct r600_resource *)buffer;
97 }
98
99
100 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
101 unsigned id,
102 struct r600_resource *bo,
103 int start,
104 int size)
105 {
106 struct pipe_surface rat_templ;
107 struct r600_surface *surf = NULL;
108 struct r600_context *rctx = NULL;
109
110 assert(id < 12);
111 assert((size & 3) == 0);
112 assert((start & 0xFF) == 0);
113
114 rctx = pipe->ctx;
115
116 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
117
118 /* Create the RAT surface */
119 memset(&rat_templ, 0, sizeof(rat_templ));
120 rat_templ.format = PIPE_FORMAT_R32_UINT;
121 rat_templ.u.tex.level = 0;
122 rat_templ.u.tex.first_layer = 0;
123 rat_templ.u.tex.last_layer = 0;
124
125 /* Add the RAT to the list of color buffers. Drop the old buffer first. */
126 pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer.resource = buffer;
155 vb->is_user_buffer = false;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 /* We need to define these R600 registers here, because we can't include
181 * r600d.h alongside evergreend.h.
182 */
183 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
184 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
185
186 #ifdef HAVE_OPENCL
187 static void parse_symbol_table(Elf_Data *symbol_table_data,
188 const GElf_Shdr *symbol_table_header,
189 struct ac_shader_binary *binary)
190 {
191 GElf_Sym symbol;
192 unsigned i = 0;
193 unsigned symbol_count =
194 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
195
196 /* We are over-allocating this list, because symbol_count gives the
197 * total number of symbols, and we will only be filling the list
198 * with offsets of global symbols. The memory savings from
199 * allocating the correct size of this list will be small, and
200 * I don't think it is worth the cost of pre-computing the number
201 * of global symbols.
202 */
203 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
204
205 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
206 unsigned i;
207 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
208 symbol.st_shndx == 0 /* Undefined symbol */) {
209 continue;
210 }
211
212 binary->global_symbol_offsets[binary->global_symbol_count] =
213 symbol.st_value;
214
215 /* Sort the list using bubble sort. This list will usually
216 * be small. */
217 for (i = binary->global_symbol_count; i > 0; --i) {
218 uint64_t lhs = binary->global_symbol_offsets[i - 1];
219 uint64_t rhs = binary->global_symbol_offsets[i];
220 if (lhs < rhs) {
221 break;
222 }
223 binary->global_symbol_offsets[i] = lhs;
224 binary->global_symbol_offsets[i - 1] = rhs;
225 }
226 ++binary->global_symbol_count;
227 }
228 }
229
230
231 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
232 unsigned symbol_sh_link,
233 struct ac_shader_binary *binary)
234 {
235 unsigned i;
236
237 if (!relocs || !symbols || !binary->reloc_count) {
238 return;
239 }
240 binary->relocs = CALLOC(binary->reloc_count,
241 sizeof(struct ac_shader_reloc));
242 for (i = 0; i < binary->reloc_count; i++) {
243 GElf_Sym symbol;
244 GElf_Rel rel;
245 char *symbol_name;
246 struct ac_shader_reloc *reloc = &binary->relocs[i];
247
248 gelf_getrel(relocs, i, &rel);
249 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
250 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
251
252 reloc->offset = rel.r_offset;
253 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
254 reloc->name[sizeof(reloc->name)-1] = 0;
255 }
256 }
257
258 static void r600_elf_read(const char *elf_data, unsigned elf_size,
259 struct ac_shader_binary *binary)
260 {
261 char *elf_buffer;
262 Elf *elf;
263 Elf_Scn *section = NULL;
264 Elf_Data *symbols = NULL, *relocs = NULL;
265 size_t section_str_index;
266 unsigned symbol_sh_link = 0;
267
268 /* One of the libelf implementations
269 * (http://www.mr511.de/software/english.htm) requires calling
270 * elf_version() before elf_memory().
271 */
272 elf_version(EV_CURRENT);
273 elf_buffer = MALLOC(elf_size);
274 memcpy(elf_buffer, elf_data, elf_size);
275
276 elf = elf_memory(elf_buffer, elf_size);
277
278 elf_getshdrstrndx(elf, &section_str_index);
279
280 while ((section = elf_nextscn(elf, section))) {
281 const char *name;
282 Elf_Data *section_data = NULL;
283 GElf_Shdr section_header;
284 if (gelf_getshdr(section, &section_header) != &section_header) {
285 fprintf(stderr, "Failed to read ELF section header\n");
286 return;
287 }
288 name = elf_strptr(elf, section_str_index, section_header.sh_name);
289 if (!strcmp(name, ".text")) {
290 section_data = elf_getdata(section, section_data);
291 binary->code_size = section_data->d_size;
292 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
293 memcpy(binary->code, section_data->d_buf, binary->code_size);
294 } else if (!strcmp(name, ".AMDGPU.config")) {
295 section_data = elf_getdata(section, section_data);
296 binary->config_size = section_data->d_size;
297 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
298 memcpy(binary->config, section_data->d_buf, binary->config_size);
299 } else if (!strcmp(name, ".AMDGPU.disasm")) {
300 /* Always read disassembly if it's available. */
301 section_data = elf_getdata(section, section_data);
302 binary->disasm_string = strndup(section_data->d_buf,
303 section_data->d_size);
304 } else if (!strncmp(name, ".rodata", 7)) {
305 section_data = elf_getdata(section, section_data);
306 binary->rodata_size = section_data->d_size;
307 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
308 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
309 } else if (!strncmp(name, ".symtab", 7)) {
310 symbols = elf_getdata(section, section_data);
311 symbol_sh_link = section_header.sh_link;
312 parse_symbol_table(symbols, &section_header, binary);
313 } else if (!strcmp(name, ".rel.text")) {
314 relocs = elf_getdata(section, section_data);
315 binary->reloc_count = section_header.sh_size /
316 section_header.sh_entsize;
317 }
318 }
319
320 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
321
322 if (elf) {
323 elf_end(elf);
324 }
325 FREE(elf_buffer);
326
327 /* Cache the config size per symbol */
328 if (binary->global_symbol_count) {
329 binary->config_size_per_symbol =
330 binary->config_size / binary->global_symbol_count;
331 } else {
332 binary->global_symbol_count = 1;
333 binary->config_size_per_symbol = binary->config_size;
334 }
335 }
336
337 static const unsigned char *r600_shader_binary_config_start(
338 const struct ac_shader_binary *binary,
339 uint64_t symbol_offset)
340 {
341 unsigned i;
342 for (i = 0; i < binary->global_symbol_count; ++i) {
343 if (binary->global_symbol_offsets[i] == symbol_offset) {
344 unsigned offset = i * binary->config_size_per_symbol;
345 return binary->config + offset;
346 }
347 }
348 return binary->config;
349 }
350
351 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
352 struct r600_bytecode *bc,
353 uint64_t symbol_offset,
354 boolean *use_kill)
355 {
356 unsigned i;
357 const unsigned char *config =
358 r600_shader_binary_config_start(binary, symbol_offset);
359
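/* The config section is a flat array of (register, value) pairs: each pair
 * is two little-endian 32-bit words, hence the 8-byte stride of the loop
 * below. */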
360 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
361 unsigned reg =
362 util_le32_to_cpu(*(uint32_t*)(config + i));
363 unsigned value =
364 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
365 switch (reg) {
366 /* R600 / R700 */
367 case R_028850_SQ_PGM_RESOURCES_PS:
368 case R_028868_SQ_PGM_RESOURCES_VS:
369 /* Evergreen / Northern Islands */
370 case R_028844_SQ_PGM_RESOURCES_PS:
371 case R_028860_SQ_PGM_RESOURCES_VS:
372 case R_0288D4_SQ_PGM_RESOURCES_LS:
373 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
374 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
375 break;
376 case R_02880C_DB_SHADER_CONTROL:
377 *use_kill = G_02880C_KILL_ENABLE(value);
378 break;
379 case R_0288E8_SQ_LDS_ALLOC:
380 bc->nlds_dw = value;
381 break;
382 }
383 }
384 }
385
386 static unsigned r600_create_shader(struct r600_bytecode *bc,
387 const struct ac_shader_binary *binary,
388 boolean *use_kill)
389
390 {
391 assert(binary->code_size % 4 == 0);
392 bc->bytecode = CALLOC(1, binary->code_size);
393 memcpy(bc->bytecode, binary->code, binary->code_size);
394 bc->ndw = binary->code_size / 4;
395
396 r600_shader_binary_read_config(binary, bc, 0, use_kill);
397 return 0;
398 }
399
400 #endif
401
402 static void r600_destroy_shader(struct r600_bytecode *bc)
403 {
404 FREE(bc->bytecode);
405 }
406
407 static void *evergreen_create_compute_state(struct pipe_context *ctx,
408 const struct pipe_compute_state *cso)
409 {
410 struct r600_context *rctx = (struct r600_context *)ctx;
411 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
412 #ifdef HAVE_OPENCL
413 const struct pipe_llvm_program_header *header;
414 const char *code;
415 void *p;
416 boolean use_kill;
417 #endif
418
419 shader->ctx = rctx;
420 shader->local_size = cso->req_local_mem;
421 shader->private_size = cso->req_private_mem;
422 shader->input_size = cso->req_input_mem;
423
424 shader->ir_type = cso->ir_type;
425
426 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
427 shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
428 return shader;
429 }
430 #ifdef HAVE_OPENCL
431 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
432 header = cso->prog;
433 code = cso->prog + sizeof(struct pipe_llvm_program_header);
434 radeon_shader_binary_init(&shader->binary);
435 r600_elf_read(code, header->num_bytes, &shader->binary);
436 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
437
438 /* Upload code + ROdata */
439 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
440 shader->bc.ndw * 4);
441 p = r600_buffer_map_sync_with_rings(
442 &rctx->b, shader->code_bo,
443 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
444 //TODO: use util_memcpy_cpu_to_le32 ?
445 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
446 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
447 #endif
448
449 return shader;
450 }
451
452 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
453 {
454 struct r600_context *rctx = (struct r600_context *)ctx;
455 struct r600_pipe_compute *shader = state;
456
457 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
458
459 if (!shader)
460 return;
461
462 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
463 r600_delete_shader_selector(ctx, shader->sel);
464 } else {
465 #ifdef HAVE_OPENCL
466 radeon_shader_binary_clean(&shader->binary);
467 pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
468 pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
469 #endif
470 r600_destroy_shader(&shader->bc);
471 }
472 FREE(shader);
473 }
474
475 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
476 {
477 struct r600_context *rctx = (struct r600_context *)ctx;
478 struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
479 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
480
481 if (!state) {
482 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
483 return;
484 }
485
486 if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
487 bool compute_dirty;
488
489 r600_shader_select(ctx, cstate->sel, &compute_dirty);
490 }
491
492 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
493 }
494
495 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
496 * kernel parameters, there are implicit parameters that need to be stored
497 * in the vertex buffer as well. Here is how these parameters are organized in
498 * the buffer:
499 *
500 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
501 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
502 * DWORDS 6-8: Number of work items within each work group in each dimension
503 * (x,y,z)
504 * DWORDS 9+ : Kernel parameters
505 */
506 static void evergreen_compute_upload_input(struct pipe_context *ctx,
507 const struct pipe_grid_info *info)
508 {
509 struct r600_context *rctx = (struct r600_context *)ctx;
510 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
511 unsigned i;
512 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
513 * parameters.
514 */
515 unsigned input_size;
516 uint32_t *num_work_groups_start;
517 uint32_t *global_size_start;
518 uint32_t *local_size_start;
519 uint32_t *kernel_parameters_start;
520 struct pipe_box box;
521 struct pipe_transfer *transfer = NULL;
522
523 if (!shader)
524 return;
525 if (shader->input_size == 0) {
526 return;
527 }
528 input_size = shader->input_size + 36;
529 if (!shader->kernel_param) {
530 /* Add space for the grid dimensions */
531 shader->kernel_param = (struct r600_resource *)
532 pipe_buffer_create(ctx->screen, 0,
533 PIPE_USAGE_IMMUTABLE, input_size);
534 }
535
536 u_box_1d(0, input_size, &box);
537 num_work_groups_start = ctx->transfer_map(ctx,
538 (struct pipe_resource*)shader->kernel_param,
539 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
540 &box, &transfer);
541 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
542 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
543 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
544
545 /* Copy the grid size (the number of work groups) */
546 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
547
548 /* Copy the global size */
549 for (i = 0; i < 3; i++) {
550 global_size_start[i] = info->grid[i] * info->block[i];
551 }
552
553 /* Copy the local dimensions */
554 memcpy(local_size_start, info->block, 3 * sizeof(uint));
555
556 /* Copy the kernel inputs */
557 memcpy(kernel_parameters_start, info->input, shader->input_size);
558
559 for (i = 0; i < (input_size / 4); i++) {
560 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
561 ((unsigned*)num_work_groups_start)[i]);
562 }
563
564 ctx->transfer_unmap(ctx, transfer);
565
566 /* ID=0 and ID=3 are reserved for the parameters.
567 * LLVM will prefer to use ID=0, but it does not work for dynamic
568 * indices. */
569 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
570 (struct pipe_resource*)shader->kernel_param);
571 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
572 (struct pipe_resource*)shader->kernel_param);
573 }
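/* A sketch, for illustration only: the input buffer written by
 * evergreen_compute_upload_input() above, laid out as a hypothetical C
 * struct (this type is not used by the driver).
 */
#if 0
struct evergreen_kernel_input {
	uint32_t num_work_groups[3];   /* DWORDS 0-2: number of work groups (x, y, z) */
	uint32_t global_size[3];       /* DWORDS 3-5: grid[i] * block[i] */
	uint32_t local_size[3];        /* DWORDS 6-8: work items per group (x, y, z) */
	uint32_t kernel_parameters[];  /* DWORDS 9+ : explicit kernel arguments */
};
#endif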
574
575 static void evergreen_emit_dispatch(struct r600_context *rctx,
576 const struct pipe_grid_info *info,
577 uint32_t indirect_grid[3])
578 {
579 int i;
580 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
581 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
582 bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
583 unsigned num_waves;
584 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
585 unsigned wave_divisor = (16 * num_pipes);
586 int group_size = 1;
587 int grid_size = 1;
588 unsigned lds_size = shader->local_size / 4;
589
590 if (shader->ir_type != PIPE_SHADER_IR_TGSI)
591 lds_size += shader->bc.nlds_dw;
592
593 /* Calculate group_size/grid_size */
594 for (i = 0; i < 3; i++) {
595 group_size *= info->block[i];
596 }
597
598 for (i = 0; i < 3; i++) {
599 grid_size *= info->grid[i];
600 }
601
602 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
603 num_waves = (info->block[0] * info->block[1] * info->block[2] +
604 wave_divisor - 1) / wave_divisor;
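/* For example (an assumed configuration): an 8x8x1 thread block on a chip
 * with two quad pipes gives wave_divisor = 16 * 2 = 32, so
 * num_waves = ceil(64 / 32) = 2 wavefronts per thread block. */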
605
606 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
607 "%u wavefronts per thread block, "
608 "allocating %u dwords lds.\n",
609 num_pipes, num_waves, lds_size);
610
611 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
612
613 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
614 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
615 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
616 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
617
618 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
619 group_size);
620
621 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
622 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
623 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
624 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
625
626 if (rctx->b.chip_class < CAYMAN) {
627 assert(lds_size <= 8192);
628 } else {
629 /* Cayman appears to have a slightly smaller limit, see the
630 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
631 assert(lds_size <= 8160);
632 }
633
634 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
635 lds_size | (num_waves << 14));
636
637 if (info->indirect) {
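/* The indirect grid size has already been read back from the indirect
 * buffer in compute_emit_cs(), so a direct dispatch packet is emitted
 * here with those values. */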
638 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
639 radeon_emit(cs, indirect_grid[0]);
640 radeon_emit(cs, indirect_grid[1]);
641 radeon_emit(cs, indirect_grid[2]);
642 radeon_emit(cs, 1);
643 } else {
644 /* Dispatch packet */
645 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
646 radeon_emit(cs, info->grid[0]);
647 radeon_emit(cs, info->grid[1]);
648 radeon_emit(cs, info->grid[2]);
649 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
650 radeon_emit(cs, 1);
651 }
652
653 if (rctx->is_debug)
654 eg_trace_emit(rctx);
655 }
656
657 static void compute_setup_cbs(struct r600_context *rctx)
658 {
659 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
660 unsigned i;
661
662 /* Emit colorbuffers. */
663 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
664 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
665 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
666 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
667 (struct r600_resource*)cb->base.texture,
668 RADEON_USAGE_READWRITE,
669 RADEON_PRIO_SHADER_RW_BUFFER);
670
671 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
672 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
673 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
674 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
675 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
676 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
677 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
678 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
679
680 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
681 radeon_emit(cs, reloc);
682
683 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
684 radeon_emit(cs, reloc);
685 }
686 for (; i < 8 ; i++)
687 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
688 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
689 for (; i < 12; i++)
690 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
691 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
692
693 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
694 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
695 rctx->compute_cb_target_mask);
696 }
697
698 static void compute_emit_cs(struct r600_context *rctx,
699 const struct pipe_grid_info *info)
700 {
701 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
702 bool compute_dirty = false;
703 struct r600_pipe_shader *current;
704 struct r600_shader_atomic combined_atomics[8];
705 uint8_t atomic_used_mask;
706 uint32_t indirect_grid[3] = { 0, 0, 0 };
707
708 /* make sure that the gfx ring is the only active one */
709 if (radeon_emitted(rctx->b.dma.cs, 0)) {
710 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
711 }
712
713 r600_update_compressed_resource_state(rctx, true);
714
715 if (!rctx->cmd_buf_is_compute) {
716 rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
717 rctx->cmd_buf_is_compute = true;
718 }
719
720 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
721 r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
722 current = rctx->cs_shader_state.shader->sel->current;
723 if (compute_dirty) {
724 rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
725 r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
726 r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
727 }
728
729 bool need_buf_const = current->shader.uses_tex_buffers ||
730 current->shader.has_txq_cube_array_z_comp;
731
732 if (info->indirect) {
733 struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
734 unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
735 unsigned offset = info->indirect_offset / 4;
736 indirect_grid[0] = data[offset];
737 indirect_grid[1] = data[offset + 1];
738 indirect_grid[2] = data[offset + 2];
739 }
740 for (int i = 0; i < 3; i++) {
741 rctx->cs_block_grid_sizes[i] = info->block[i];
742 rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
743 }
744 rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
745 rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
746
747 evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
748 r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
749
750 if (need_buf_const) {
751 eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
752 }
753 r600_update_driver_const_buffers(rctx, true);
754
755 evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
756 if (atomic_used_mask) {
757 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
758 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
759 }
760 } else
761 r600_need_cs_space(rctx, 0, true, 0);
762
763 /* Initialize all the compute-related registers.
764 *
765 * See evergreen_init_atom_start_compute_cs() in this file for the list
766 * of registers initialized by the start_compute_cs_cmd atom.
767 */
768 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
769
770 /* emit config state */
771 if (rctx->b.chip_class == EVERGREEN) {
772 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
773 radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
774 radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
775 radeon_emit(cs, 0);
776 radeon_emit(cs, 0);
777 radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
778 } else
779 r600_emit_atom(rctx, &rctx->config_state.atom);
780 }
781
782 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
783 r600_flush_emit(rctx);
784
785 if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {
786
787 compute_setup_cbs(rctx);
788
789 /* Emit vertex buffer state */
790 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
791 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
792 } else {
793 uint32_t rat_mask;
794
795 rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
796 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
797 rat_mask);
798 }
799
800 r600_emit_atom(rctx, &rctx->b.render_cond_atom);
801
802 /* Emit constant buffer state */
803 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
804
805 /* Emit sampler state */
806 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
807
808 /* Emit sampler view (texture resource) state */
809 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
810
811 /* Emit images state */
812 r600_emit_atom(rctx, &rctx->compute_images.atom);
813
814 /* Emit buffers state */
815 r600_emit_atom(rctx, &rctx->compute_buffers.atom);
816
817 /* Emit shader state */
818 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
819
820 /* Emit dispatch state and dispatch packet */
821 evergreen_emit_dispatch(rctx, info, indirect_grid);
822
823 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
824 */
825 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
826 R600_CONTEXT_INV_VERTEX_CACHE |
827 R600_CONTEXT_INV_TEX_CACHE;
828 r600_flush_emit(rctx);
829 rctx->b.flags = 0;
830
831 if (rctx->b.chip_class >= CAYMAN) {
832 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
833 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
834 /* DEALLOC_STATE prevents the GPU from hanging when a
835 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
836 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
837 */
838 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
839 radeon_emit(cs, 0);
840 }
841 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
842 evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
843
844 #if 0
845 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
846 for (i = 0; i < cs->cdw; i++) {
847 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
848 }
849 #endif
850
851 }
852
853
854 /**
855 * Emit function for r600_cs_shader_state atom
856 */
857 void evergreen_emit_cs_shader(struct r600_context *rctx,
858 struct r600_atom *atom)
859 {
860 struct r600_cs_shader_state *state =
861 (struct r600_cs_shader_state*)atom;
862 struct r600_pipe_compute *shader = state->shader;
863 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
864 uint64_t va;
865 struct r600_resource *code_bo;
866 unsigned ngpr, nstack;
867
868 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
869 code_bo = shader->sel->current->bo;
870 va = shader->sel->current->bo->gpu_address;
871 ngpr = shader->sel->current->shader.bc.ngpr;
872 nstack = shader->sel->current->shader.bc.nstack;
873 } else {
874 code_bo = shader->code_bo;
875 va = shader->code_bo->gpu_address + state->pc;
876 ngpr = shader->bc.ngpr;
877 nstack = shader->bc.nstack;
878 }
879
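/* Compute shaders run on the CS (aka LS) hardware stage, so the program
 * address and resources are programmed through the SQ_PGM_*_LS registers. */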
880 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
881 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
882 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
883 S_0288D4_NUM_GPRS(ngpr) |
884 S_0288D4_DX10_CLAMP(1) |
885 S_0288D4_STACK_SIZE(nstack));
886 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
887
888 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
889 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
890 code_bo, RADEON_USAGE_READ,
891 RADEON_PRIO_SHADER_BINARY));
892 }
893
894 static void evergreen_launch_grid(struct pipe_context *ctx,
895 const struct pipe_grid_info *info)
896 {
897 struct r600_context *rctx = (struct r600_context *)ctx;
898 #ifdef HAVE_OPENCL
899 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
900 boolean use_kill;
901
902 if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
903 rctx->cs_shader_state.pc = info->pc;
904 /* Get the config information for this kernel. */
905 r600_shader_binary_read_config(&shader->binary, &shader->bc,
906 info->pc, &use_kill);
907 } else {
908 use_kill = false;
909 rctx->cs_shader_state.pc = 0;
910 }
911 #endif
912
913 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
914
915
916 evergreen_compute_upload_input(ctx, info);
917 compute_emit_cs(rctx, info);
918 }
919
920 static void evergreen_set_compute_resources(struct pipe_context *ctx,
921 unsigned start, unsigned count,
922 struct pipe_surface **surfaces)
923 {
924 struct r600_context *rctx = (struct r600_context *)ctx;
925 struct r600_surface **resources = (struct r600_surface **)surfaces;
926
927 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
928 start, count);
929
930 for (unsigned i = 0; i < count; i++) {
931 /* The first four vertex buffers are reserved for parameters and
932 * global buffers. */
933 unsigned vtx_id = 4 + i;
934 if (resources[i]) {
935 struct r600_resource_global *buffer =
936 (struct r600_resource_global*)
937 resources[i]->base.texture;
938 if (resources[i]->base.writable) {
939 assert(i+1 < 12);
940
941 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
942 (struct r600_resource *)resources[i]->base.texture,
943 buffer->chunk->start_in_dw*4,
944 resources[i]->base.texture->width0);
945 }
946
947 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
948 buffer->chunk->start_in_dw * 4,
949 resources[i]->base.texture);
950 }
951 }
952 }
953
954 static void evergreen_set_global_binding(struct pipe_context *ctx,
955 unsigned first, unsigned n,
956 struct pipe_resource **resources,
957 uint32_t **handles)
958 {
959 struct r600_context *rctx = (struct r600_context *)ctx;
960 struct compute_memory_pool *pool = rctx->screen->global_pool;
961 struct r600_resource_global **buffers =
962 (struct r600_resource_global **)resources;
963 unsigned i;
964
965 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
966 first, n);
967
968 if (!resources) {
969 /* XXX: Unset */
970 return;
971 }
972
973 /* We mark these items for promotion to the pool if they
974 * aren't already there */
975 for (i = first; i < first + n; i++) {
976 struct compute_memory_item *item = buffers[i]->chunk;
977
978 if (!is_item_in_pool(item))
979 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
980 }
981
982 if (compute_memory_finalize_pending(pool, ctx) == -1) {
983 /* XXX: Unset */
984 return;
985 }
986
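/* Rewrite each handle from an offset relative to its own buffer into an
 * offset relative to the start of the global memory pool
 * (chunk->start_in_dw is in dwords, hence the multiply by 4). */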
987 for (i = first; i < first + n; i++)
988 {
989 uint32_t buffer_offset;
990 uint32_t handle;
991 assert(resources[i]->target == PIPE_BUFFER);
992 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
993
994 buffer_offset = util_le32_to_cpu(*(handles[i]));
995 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
996
997 *(handles[i]) = util_cpu_to_le32(handle);
998 }
999
1000 /* globals for writing */
1001 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1002 /* globals for reading */
1003 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1004 (struct pipe_resource*)pool->bo);
1005
1006 /* constants for reading; LLVM puts them in the text segment */
1007 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1008 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1009 }
1010
1011 /**
1012 * This function initializes all the compute specific registers that need to
1013 * be initialized for each compute command stream. Registers that are common
1014 * to both compute and 3D will be initialized at the beginning of each compute
1015 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
1016 * packet requires that the shader type bit be set, we must initialize all
1017 * context registers needed for compute in this function. The registers
1018 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1019 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1020 * on the GPU family.
1021 */
1022 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1023 {
1024 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1025 int num_threads;
1026 int num_stack_entries;
1027
1028 /* since all required registers are initialized in the
1029 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1030 */
1031 r600_init_command_buffer(cb, 256);
1032 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1033
1034 /* We're setting config registers here. */
1035 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1036 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1037
1038 switch (rctx->b.family) {
1039 case CHIP_CEDAR:
1040 default:
1041 num_threads = 128;
1042 num_stack_entries = 256;
1043 break;
1044 case CHIP_REDWOOD:
1045 num_threads = 128;
1046 num_stack_entries = 256;
1047 break;
1048 case CHIP_JUNIPER:
1049 num_threads = 128;
1050 num_stack_entries = 512;
1051 break;
1052 case CHIP_CYPRESS:
1053 case CHIP_HEMLOCK:
1054 num_threads = 128;
1055 num_stack_entries = 512;
1056 break;
1057 case CHIP_PALM:
1058 num_threads = 128;
1059 num_stack_entries = 256;
1060 break;
1061 case CHIP_SUMO:
1062 num_threads = 128;
1063 num_stack_entries = 256;
1064 break;
1065 case CHIP_SUMO2:
1066 num_threads = 128;
1067 num_stack_entries = 512;
1068 break;
1069 case CHIP_BARTS:
1070 num_threads = 128;
1071 num_stack_entries = 512;
1072 break;
1073 case CHIP_TURKS:
1074 num_threads = 128;
1075 num_stack_entries = 256;
1076 break;
1077 case CHIP_CAICOS:
1078 num_threads = 128;
1079 num_stack_entries = 256;
1080 break;
1081 }
1082
1083 /* The primitive type always needs to be POINTLIST for compute. */
1084 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1085 V_008958_DI_PT_POINTLIST);
1086
1087 if (rctx->b.chip_class < CAYMAN) {
1088
1089 /* These registers control which simds can be used by each stage.
1090 * The default for these registers is 0xffffffff, which means
1091 * all simds are available for each stage. It's possible we may
1092 * want to play around with these in the future, but for now
1093 * the default value is fine.
1094 *
1095 * R_008E20_SQ_STATIC_THREAD_MGMT1
1096 * R_008E24_SQ_STATIC_THREAD_MGMT2
1097 * R_008E28_SQ_STATIC_THREAD_MGMT3
1098 */
1099
1100 /* XXX: We may need to adjust the thread and stack resource
1101 * values for 3D/compute interop */
1102
1103 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1104
1105 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1106 * Set the number of threads used by the PS/VS/GS/ES stage to
1107 * 0.
1108 */
1109 r600_store_value(cb, 0);
1110
1111 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1112 * Set the number of threads used by the CS (aka LS) stage to
1113 * the maximum number of threads and set the number of threads
1114 * for the HS stage to 0. */
1115 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1116
1117 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1118 * Set the Control Flow stack entries to 0 for PS/VS stages */
1119 r600_store_value(cb, 0);
1120
1121 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1122 * Set the Control Flow stack entries to 0 for GS/ES stages */
1123 r600_store_value(cb, 0);
1124
1125 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1126 * Set the Control Flow stack entries to 0 for the HS stage, and
1127 * set it to the maximum value for the CS (aka LS) stage. */
1128 r600_store_value(cb,
1129 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1130 }
1131 /* Give the compute shader all the available LDS space.
1132 * NOTE: This only sets the maximum number of dwords that a compute
1133 * shader can allocate. When a shader is executed, we still need to
1134 * allocate the appropriate amount of LDS dwords using the
1135 * CM_R_0288E8_SQ_LDS_ALLOC register.
1136 */
1137 if (rctx->b.chip_class < CAYMAN) {
1138 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1139 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1140 } else {
1141 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1142 S_0286FC_NUM_PS_LDS(0) |
1143 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1144 }
1145
1146 /* Context Registers */
1147
1148 if (rctx->b.chip_class < CAYMAN) {
1149 /* workaround for hw issues with dyn gpr - must set all limits
1150 * to 240 instead of 0, 0x1e == 240 / 8
1151 */
1152 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1153 S_028838_PS_GPRS(0x1e) |
1154 S_028838_VS_GPRS(0x1e) |
1155 S_028838_GS_GPRS(0x1e) |
1156 S_028838_ES_GPRS(0x1e) |
1157 S_028838_HS_GPRS(0x1e) |
1158 S_028838_LS_GPRS(0x1e));
1159 }
1160
1161 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1162 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1163 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1164
1165 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1166
1167 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1168 S_0286E8_TID_IN_GROUP_ENA(1) |
1169 S_0286E8_TGID_ENA(1) |
1170 S_0286E8_DISABLE_INDEX_PACK(1));
1171
1172 /* The LOOP_CONST registers are an optimization for loops that allows
1173 * you to store the initial counter, increment value, and maximum
1174 * counter value in a register so that hardware can calculate the
1175 * correct number of iterations for the loop, so that you don't need
1176 * to have the loop counter in your shader code. We don't currently use
1177 * this optimization, so we must keep track of the counter in the
1178 * shader and use a break instruction to exit loops. However, the
1179 * hardware will still use this register to determine when to exit a
1180 * loop, so we need to initialize the counter to 0, set the increment
1181 * value to 1 and the maximum counter value to 4095 (0xfff), which
1182 * is the maximum value allowed. This gives us a maximum of 4096
1183 * iterations for our loops, but hopefully our break instruction will
1184 * execute some time before the 4096th iteration.
1185 */
1186 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
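/* A sketch of how 0x1000FFF appears to decode, matching the comment above:
 * increment = 0x01 (bits 31:24), initial value = 0x000 (bits 23:12),
 * maximum count = 0xFFF (bits 11:0). */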
1187 }
1188
1189 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1190 {
1191 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1192 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1193 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1194 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1195 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1196 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1197 rctx->b.b.launch_grid = evergreen_launch_grid;
1198
1199 }
1200
1201 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1202 struct pipe_resource *resource,
1203 unsigned level,
1204 unsigned usage,
1205 const struct pipe_box *box,
1206 struct pipe_transfer **ptransfer)
1207 {
1208 struct r600_context *rctx = (struct r600_context*)ctx;
1209 struct compute_memory_pool *pool = rctx->screen->global_pool;
1210 struct r600_resource_global* buffer =
1211 (struct r600_resource_global*)resource;
1212
1213 struct compute_memory_item *item = buffer->chunk;
1214 struct pipe_resource *dst = NULL;
1215 unsigned offset = box->x;
1216
1217 if (is_item_in_pool(item)) {
1218 compute_memory_demote_item(pool, item, ctx);
1219 }
1220 else {
1221 if (item->real_buffer == NULL) {
1222 item->real_buffer =
1223 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1224 }
1225 }
1226
1227 dst = (struct pipe_resource*)item->real_buffer;
1228
1229 if (usage & PIPE_TRANSFER_READ)
1230 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1231
1232 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1233 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1234 "width = %u, height = %u, depth = %u)\n", level, usage,
1235 box->x, box->y, box->z, box->width, box->height,
1236 box->depth);
1237 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1238 "%u (box.x)\n", item->id, box->x);
1239
1240
1241 assert(resource->target == PIPE_BUFFER);
1242 assert(resource->bind & PIPE_BIND_GLOBAL);
1243 assert(box->x >= 0);
1244 assert(box->y == 0);
1245 assert(box->z == 0);
1246
1247 ///TODO: do it better, mapping is not possible if the pool is too big
1248 return pipe_buffer_map_range(ctx, dst,
1249 offset, box->width, usage, ptransfer);
1250 }
1251
1252 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1253 struct pipe_transfer *transfer)
1254 {
1255 /* struct r600_resource_global objects are not real resources; they just map
1256 * to an offset within the compute memory pool. The function
1257 * r600_compute_global_transfer_map() maps the memory pool
1258 * resource rather than the struct r600_resource_global passed to
1259 * it as an argument and then initializes ptransfer->resource with
1260 * the memory pool resource (via pipe_buffer_map_range).
1261 * When transfer_unmap is called it uses the memory pool's
1262 * vtable which calls r600_buffer_transfer_unmap() rather than
1263 * this function.
1264 */
1265 assert (!"This function should not be called");
1266 }
1267
1268 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1269 struct pipe_transfer *transfer,
1270 const struct pipe_box *box)
1271 {
1272 assert(0 && "TODO");
1273 }
1274
1275 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1276 struct pipe_resource *res)
1277 {
1278 struct r600_resource_global* buffer = NULL;
1279 struct r600_screen* rscreen = NULL;
1280
1281 assert(res->target == PIPE_BUFFER);
1282 assert(res->bind & PIPE_BIND_GLOBAL);
1283
1284 buffer = (struct r600_resource_global*)res;
1285 rscreen = (struct r600_screen*)screen;
1286
1287 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1288
1289 buffer->chunk = NULL;
1290 free(res);
1291 }
1292
1293 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1294 {
1295 u_default_resource_get_handle, /* get_handle */
1296 r600_compute_global_buffer_destroy, /* resource_destroy */
1297 r600_compute_global_transfer_map, /* transfer_map */
1298 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1299 r600_compute_global_transfer_unmap, /* transfer_unmap */
1300 };
1301
1302 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1303 const struct pipe_resource *templ)
1304 {
1305 struct r600_resource_global* result = NULL;
1306 struct r600_screen* rscreen = NULL;
1307 int size_in_dw = 0;
1308
1309 assert(templ->target == PIPE_BUFFER);
1310 assert(templ->bind & PIPE_BIND_GLOBAL);
1311 assert(templ->array_size == 1 || templ->array_size == 0);
1312 assert(templ->depth0 == 1 || templ->depth0 == 0);
1313 assert(templ->height0 == 1 || templ->height0 == 0);
1314
1315 result = (struct r600_resource_global*)
1316 CALLOC(sizeof(struct r600_resource_global), 1);
1317 rscreen = (struct r600_screen*)screen;
1318
1319 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1320 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1321 templ->array_size);
1322
1323 result->base.b.vtbl = &r600_global_buffer_vtbl;
1324 result->base.b.b = *templ;
1325 result->base.b.b.screen = screen;
1326 pipe_reference_init(&result->base.b.b.reference, 1);
1327
1328 size_in_dw = (templ->width0+3) / 4;
1329
1330 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1331
1332 if (result->chunk == NULL)
1333 {
1334 free(result);
1335 return NULL;
1336 }
1337
1338 return &result->base.b.b;
1339 }