1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <gelf.h>
28 #include <libelf.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include "pipe/p_defines.h"
32 #include "pipe/p_state.h"
33 #include "pipe/p_context.h"
34 #include "util/u_blitter.h"
35 #include "util/list.h"
36 #include "util/u_transfer.h"
37 #include "util/u_surface.h"
38 #include "util/u_pack_color.h"
39 #include "util/u_memory.h"
40 #include "util/u_inlines.h"
41 #include "util/u_framebuffer.h"
42 #include "pipebuffer/pb_buffer.h"
43 #include "evergreend.h"
44 #include "r600_shader.h"
45 #include "r600_pipe.h"
46 #include "r600_formats.h"
47 #include "evergreen_compute.h"
48 #include "evergreen_compute_internal.h"
49 #include "compute_memory_pool.h"
50 #include "sb/sb_public.h"
51 #include <inttypes.h>
52
53 /**
54 RAT0 is for global binding write
55 VTX1 is for global binding read
56
57 for writing images: RAT1...
58 for reading images: TEX2...
59 TEX2 is paired with RAT1
60 
61 TEX2... consumes the same fetch resources that VTX2... would consume
62 
63 CONST0 and VTX0 are for parameters
64 CONST0 binds the smaller input parameter buffer and is used for constant indexing;
65 it is also cached in the constant cache
66 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
67 the constant cache can handle
68 
69 RATs are limited to 12, so we can bind at most 11 textures for writing,
70 because we reserve RAT0 for global bindings. With byte addressing enabled
71 we should reserve another one too => at most 10 image bindings for writing.
72 
73 from Nvidia OpenCL:
74 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
75 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
76 
77 so 10 for writing is enough; 176 is the max for reading according to the docs
78 
79 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
80 writable images also consume TEX slots, and VTX slots too, because of linear indexing
81
82 */
83
84 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
85 unsigned size)
86 {
87 struct pipe_resource *buffer = NULL;
88 assert(size);
89
90 buffer = pipe_buffer_create((struct pipe_screen*) screen,
91 0, PIPE_USAGE_IMMUTABLE, size);
92
93 return (struct r600_resource *)buffer;
94 }
95
96
97 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
98 unsigned id,
99 struct r600_resource *bo,
100 int start,
101 int size)
102 {
103 struct pipe_surface rat_templ;
104 struct r600_surface *surf = NULL;
105 struct r600_context *rctx = NULL;
106
107 assert(id < 12);
108 assert((size & 3) == 0);
109 assert((start & 0xFF) == 0);
110
111 rctx = pipe->ctx;
112
113 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
114
115 /* Create the RAT surface */
116 memset(&rat_templ, 0, sizeof(rat_templ));
117 rat_templ.format = PIPE_FORMAT_R32_UINT;
118 rat_templ.u.tex.level = 0;
119 rat_templ.u.tex.first_layer = 0;
120 rat_templ.u.tex.last_layer = 0;
121
122 	/* Add the RAT to the list of color buffers */
123 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
124 (struct pipe_context *)pipe->ctx,
125 (struct pipe_resource *)bo, &rat_templ);
126
127 /* Update the number of color buffers */
128 pipe->ctx->framebuffer.state.nr_cbufs =
129 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
130
131 /* Update the cb_target_mask
132 * XXX: I think this is a potential spot for bugs once we start doing
133 * GL interop. cb_target_mask may be modified in the 3D sections
134 * of this driver. */
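	/* Each color buffer owns a 4-bit channel write mask in CB_TARGET_MASK,
	 * hence enabling all four channels of this RAT's slot with 0xf << (id * 4). */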
135 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
136
137 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
138 evergreen_init_color_surface_rat(rctx, surf);
139 }
140
141 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
142 unsigned vb_index,
143 unsigned offset,
144 struct pipe_resource *buffer)
145 {
146 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
147 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
148 vb->stride = 1;
149 vb->buffer_offset = offset;
150 vb->buffer.resource = buffer;
151 vb->is_user_buffer = false;
152
153 /* The vertex instructions in the compute shaders use the texture cache,
154 * so we need to invalidate it. */
155 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
156 state->enabled_mask |= 1 << vb_index;
157 state->dirty_mask |= 1 << vb_index;
158 r600_mark_atom_dirty(rctx, &state->atom);
159 }
160
161 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
162 unsigned cb_index,
163 unsigned offset,
164 unsigned size,
165 struct pipe_resource *buffer)
166 {
167 struct pipe_constant_buffer cb;
168 cb.buffer_size = size;
169 cb.buffer_offset = offset;
170 cb.buffer = buffer;
171 cb.user_buffer = NULL;
172
173 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
174 }
175
176 /* We need to define these R600 registers here, because we can't include
177 * evergreend.h and r600d.h.
178 */
179 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
180 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
181
182 #ifdef HAVE_OPENCL
183 static void parse_symbol_table(Elf_Data *symbol_table_data,
184 const GElf_Shdr *symbol_table_header,
185 struct ac_shader_binary *binary)
186 {
187 GElf_Sym symbol;
188 unsigned i = 0;
189 unsigned symbol_count =
190 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
191
192 /* We are over allocating this list, because symbol_count gives the
193 * total number of symbols, and we will only be filling the list
194 * with offsets of global symbols. The memory savings from
195 * allocating the correct size of this list will be small, and
196 * I don't think it is worth the cost of pre-computing the number
197 * of global symbols.
198 */
199 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
200
201 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
202 unsigned i;
203 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
204 symbol.st_shndx == 0 /* Undefined symbol */) {
205 continue;
206 }
207
208 binary->global_symbol_offsets[binary->global_symbol_count] =
209 symbol.st_value;
210
211 		/* Keep the list sorted (ascending) by bubbling the new offset
212 		 * down into place. The list will usually be small. */
213 for (i = binary->global_symbol_count; i > 0; --i) {
214 uint64_t lhs = binary->global_symbol_offsets[i - 1];
215 uint64_t rhs = binary->global_symbol_offsets[i];
216 if (lhs < rhs) {
217 break;
218 }
219 binary->global_symbol_offsets[i] = lhs;
220 binary->global_symbol_offsets[i - 1] = rhs;
221 }
222 ++binary->global_symbol_count;
223 }
224 }
225
226
227 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
228 unsigned symbol_sh_link,
229 struct ac_shader_binary *binary)
230 {
231 unsigned i;
232
233 if (!relocs || !symbols || !binary->reloc_count) {
234 return;
235 }
236 binary->relocs = CALLOC(binary->reloc_count,
237 sizeof(struct ac_shader_reloc));
238 for (i = 0; i < binary->reloc_count; i++) {
239 GElf_Sym symbol;
240 GElf_Rel rel;
241 char *symbol_name;
242 struct ac_shader_reloc *reloc = &binary->relocs[i];
243
244 gelf_getrel(relocs, i, &rel);
245 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
246 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
247
248 reloc->offset = rel.r_offset;
249 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
250 reloc->name[sizeof(reloc->name)-1] = 0;
251 }
252 }
253
254 static void r600_elf_read(const char *elf_data, unsigned elf_size,
255 struct ac_shader_binary *binary)
256 {
257 char *elf_buffer;
258 Elf *elf;
259 Elf_Scn *section = NULL;
260 Elf_Data *symbols = NULL, *relocs = NULL;
261 size_t section_str_index;
262 unsigned symbol_sh_link = 0;
263
264 /* One of the libelf implementations
265 * (http://www.mr511.de/software/english.htm) requires calling
266 * elf_version() before elf_memory().
267 */
268 elf_version(EV_CURRENT);
269 elf_buffer = MALLOC(elf_size);
270 memcpy(elf_buffer, elf_data, elf_size);
271
272 elf = elf_memory(elf_buffer, elf_size);
273
274 elf_getshdrstrndx(elf, &section_str_index);
275
276 while ((section = elf_nextscn(elf, section))) {
277 const char *name;
278 Elf_Data *section_data = NULL;
279 GElf_Shdr section_header;
280 if (gelf_getshdr(section, &section_header) != &section_header) {
281 fprintf(stderr, "Failed to read ELF section header\n");
282 return;
283 }
284 name = elf_strptr(elf, section_str_index, section_header.sh_name);
285 if (!strcmp(name, ".text")) {
286 section_data = elf_getdata(section, section_data);
287 binary->code_size = section_data->d_size;
288 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
289 memcpy(binary->code, section_data->d_buf, binary->code_size);
290 } else if (!strcmp(name, ".AMDGPU.config")) {
291 section_data = elf_getdata(section, section_data);
292 binary->config_size = section_data->d_size;
293 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
294 memcpy(binary->config, section_data->d_buf, binary->config_size);
295 } else if (!strcmp(name, ".AMDGPU.disasm")) {
296 /* Always read disassembly if it's available. */
297 section_data = elf_getdata(section, section_data);
298 binary->disasm_string = strndup(section_data->d_buf,
299 section_data->d_size);
300 } else if (!strncmp(name, ".rodata", 7)) {
301 section_data = elf_getdata(section, section_data);
302 binary->rodata_size = section_data->d_size;
303 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
304 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
305 } else if (!strncmp(name, ".symtab", 7)) {
306 symbols = elf_getdata(section, section_data);
307 symbol_sh_link = section_header.sh_link;
308 parse_symbol_table(symbols, &section_header, binary);
309 } else if (!strcmp(name, ".rel.text")) {
310 relocs = elf_getdata(section, section_data);
311 binary->reloc_count = section_header.sh_size /
312 section_header.sh_entsize;
313 }
314 }
315
316 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
317
318 if (elf){
319 elf_end(elf);
320 }
321 FREE(elf_buffer);
322
323 /* Cache the config size per symbol */
324 if (binary->global_symbol_count) {
325 binary->config_size_per_symbol =
326 binary->config_size / binary->global_symbol_count;
327 } else {
328 binary->global_symbol_count = 1;
329 binary->config_size_per_symbol = binary->config_size;
330 }
331 }
332
333 static const unsigned char *r600_shader_binary_config_start(
334 const struct ac_shader_binary *binary,
335 uint64_t symbol_offset)
336 {
337 unsigned i;
338 for (i = 0; i < binary->global_symbol_count; ++i) {
339 if (binary->global_symbol_offsets[i] == symbol_offset) {
340 unsigned offset = i * binary->config_size_per_symbol;
341 return binary->config + offset;
342 }
343 }
344 return binary->config;
345 }
346
347 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
348 struct r600_bytecode *bc,
349 uint64_t symbol_offset,
350 boolean *use_kill)
351 {
352 unsigned i;
353 const unsigned char *config =
354 r600_shader_binary_config_start(binary, symbol_offset);
355
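	/* The .AMDGPU.config payload is a sequence of (register, value) dword
	 * pairs emitted by the compiler, hence the 8-byte stride below. */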
356 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
357 unsigned reg =
358 util_le32_to_cpu(*(uint32_t*)(config + i));
359 unsigned value =
360 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
361 switch (reg) {
362 /* R600 / R700 */
363 case R_028850_SQ_PGM_RESOURCES_PS:
364 case R_028868_SQ_PGM_RESOURCES_VS:
365 /* Evergreen / Northern Islands */
366 case R_028844_SQ_PGM_RESOURCES_PS:
367 case R_028860_SQ_PGM_RESOURCES_VS:
368 case R_0288D4_SQ_PGM_RESOURCES_LS:
369 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
370 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
371 break;
372 case R_02880C_DB_SHADER_CONTROL:
373 *use_kill = G_02880C_KILL_ENABLE(value);
374 break;
375 case R_0288E8_SQ_LDS_ALLOC:
376 bc->nlds_dw = value;
377 break;
378 }
379 }
380 }
381
382 static unsigned r600_create_shader(struct r600_bytecode *bc,
383 const struct ac_shader_binary *binary,
384 boolean *use_kill)
385
386 {
387 assert(binary->code_size % 4 == 0);
388 bc->bytecode = CALLOC(1, binary->code_size);
389 memcpy(bc->bytecode, binary->code, binary->code_size);
390 bc->ndw = binary->code_size / 4;
391
392 r600_shader_binary_read_config(binary, bc, 0, use_kill);
393 return 0;
394 }
395
396 #endif
397
398 static void r600_destroy_shader(struct r600_bytecode *bc)
399 {
400 FREE(bc->bytecode);
401 }
402
403 static void *evergreen_create_compute_state(struct pipe_context *ctx,
404 const struct pipe_compute_state *cso)
405 {
406 struct r600_context *rctx = (struct r600_context *)ctx;
407 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
408 #ifdef HAVE_OPENCL
409 const struct pipe_llvm_program_header *header;
410 const char *code;
411 void *p;
412 boolean use_kill;
413
414 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
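	/* cso->prog starts with a pipe_llvm_program_header whose num_bytes field
	 * gives the size of the ELF binary that immediately follows it. */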
415 header = cso->prog;
416 code = cso->prog + sizeof(struct pipe_llvm_program_header);
417 radeon_shader_binary_init(&shader->binary);
418 r600_elf_read(code, header->num_bytes, &shader->binary);
419 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
420
421 /* Upload code + ROdata */
422 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
423 shader->bc.ndw * 4);
424 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
425 //TODO: use util_memcpy_cpu_to_le32 ?
426 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
427 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
428 #endif
429
430 shader->ctx = rctx;
431 shader->local_size = cso->req_local_mem;
432 shader->private_size = cso->req_private_mem;
433 shader->input_size = cso->req_input_mem;
434
435 return shader;
436 }
437
438 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
439 {
440 struct r600_context *rctx = (struct r600_context *)ctx;
441 struct r600_pipe_compute *shader = state;
442
443 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
444
445 if (!shader)
446 return;
447
448 #ifdef HAVE_OPENCL
449 radeon_shader_binary_clean(&shader->binary);
450 #endif
451 r600_destroy_shader(&shader->bc);
452
453 /* TODO destroy shader->code_bo, shader->const_bo
454 * we'll need something like r600_buffer_free */
455 FREE(shader);
456 }
457
458 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
459 {
460 struct r600_context *rctx = (struct r600_context *)ctx;
461
462 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
463
464 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
465 }
466
467 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
468  * kernel parameters, there are implicit parameters that need to be stored
469  * in the vertex buffer as well. Here is how these parameters are organized in
470 * the buffer:
471 *
472 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
473 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
474 * DWORDS 6-8: Number of work items within each work group in each dimension
475 * (x,y,z)
476 * DWORDS 9+ : Kernel parameters
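 *
 * For example, a launch with block = {8,8,1} and grid = {4,4,1} stores
 * {4,4,1} in DWORDS 0-2, {32,32,1} (grid * block per dimension) in
 * DWORDS 3-5 and {8,8,1} in DWORDS 6-8; the caller's kernel arguments
 * then start at DWORD 9.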
477 */
478 static void evergreen_compute_upload_input(struct pipe_context *ctx,
479 const struct pipe_grid_info *info)
480 {
481 struct r600_context *rctx = (struct r600_context *)ctx;
482 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
483 unsigned i;
484 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
485 * parameters.
486 */
487 unsigned input_size = shader->input_size + 36;
488 uint32_t *num_work_groups_start;
489 uint32_t *global_size_start;
490 uint32_t *local_size_start;
491 uint32_t *kernel_parameters_start;
492 struct pipe_box box;
493 struct pipe_transfer *transfer = NULL;
494
495 if (shader->input_size == 0) {
496 return;
497 }
498
499 if (!shader->kernel_param) {
500 /* Add space for the grid dimensions */
501 shader->kernel_param = (struct r600_resource *)
502 pipe_buffer_create(ctx->screen, 0,
503 PIPE_USAGE_IMMUTABLE, input_size);
504 }
505
506 u_box_1d(0, input_size, &box);
507 num_work_groups_start = ctx->transfer_map(ctx,
508 (struct pipe_resource*)shader->kernel_param,
509 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
510 &box, &transfer);
511 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
512 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
513 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
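	/* The arithmetic above is on uint32_t pointers, so each step of
	 * (3 * sizeof(uint) / 4) skips the three dwords of the preceding
	 * implicit parameter block. */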
514
515 /* Copy the work group size */
516 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
517
518 /* Copy the global size */
519 for (i = 0; i < 3; i++) {
520 global_size_start[i] = info->grid[i] * info->block[i];
521 }
522
523 /* Copy the local dimensions */
524 memcpy(local_size_start, info->block, 3 * sizeof(uint));
525
526 /* Copy the kernel inputs */
527 memcpy(kernel_parameters_start, info->input, shader->input_size);
528
529 for (i = 0; i < (input_size / 4); i++) {
530 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
531 ((unsigned*)num_work_groups_start)[i]);
532 }
533
534 ctx->transfer_unmap(ctx, transfer);
535
536 /* ID=0 and ID=3 are reserved for the parameters.
537 	 * LLVM prefers ID=0, but it does not work for dynamic
538 * indices. */
539 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
540 (struct pipe_resource*)shader->kernel_param);
541 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
542 (struct pipe_resource*)shader->kernel_param);
543 }
544
545 static void evergreen_emit_dispatch(struct r600_context *rctx,
546 const struct pipe_grid_info *info)
547 {
548 int i;
549 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
550 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
551 unsigned num_waves;
552 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
553 unsigned wave_divisor = (16 * num_pipes);
554 int group_size = 1;
555 int grid_size = 1;
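	/* LDS is allocated in dwords: convert the kernel's byte-sized local
	 * memory request and add the dwords the compiled shader itself
	 * requires (bc.nlds_dw, taken from its config). */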
556 unsigned lds_size = shader->local_size / 4 +
557 shader->bc.nlds_dw;
558
559
560 /* Calculate group_size/grid_size */
561 for (i = 0; i < 3; i++) {
562 group_size *= info->block[i];
563 }
564
565 for (i = 0; i < 3; i++) {
566 grid_size *= info->grid[i];
567 }
568
569 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
570 num_waves = (info->block[0] * info->block[1] * info->block[2] +
571 wave_divisor - 1) / wave_divisor;
572
573 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
574 "%u wavefronts per thread block, "
575 "allocating %u dwords lds.\n",
576 num_pipes, num_waves, lds_size);
577
578 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
579
580 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
581 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
582 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
583 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
584
585 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
586 group_size);
587
588 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
589 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
590 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
591 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
592
593 if (rctx->b.chip_class < CAYMAN) {
594 assert(lds_size <= 8192);
595 } else {
596 /* Cayman appears to have a slightly smaller limit, see the
597 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
598 assert(lds_size <= 8160);
599 }
600
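	/* SQ_LDS_ALLOC takes the LDS size in dwords in its low bits and the
	 * per-thread-group wave count in the field starting at bit 14. */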
601 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
602 lds_size | (num_waves << 14));
603
604 /* Dispatch packet */
605 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
606 radeon_emit(cs, info->grid[0]);
607 radeon_emit(cs, info->grid[1]);
608 radeon_emit(cs, info->grid[2]);
609 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
610 radeon_emit(cs, 1);
611
612 if (rctx->is_debug)
613 eg_trace_emit(rctx);
614 }
615
616 static void compute_emit_cs(struct r600_context *rctx,
617 const struct pipe_grid_info *info)
618 {
619 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
620 unsigned i;
621
622 	/* make sure that the gfx ring is the only one active */
623 if (radeon_emitted(rctx->b.dma.cs, 0)) {
624 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
625 }
626
627 /* Initialize all the compute-related registers.
628 *
629 * See evergreen_init_atom_start_compute_cs() in this file for the list
630 * of registers initialized by the start_compute_cs_cmd atom.
631 */
632 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
633
634 /* emit config state */
635 if (rctx->b.chip_class == EVERGREEN)
636 r600_emit_atom(rctx, &rctx->config_state.atom);
637
638 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
639 r600_flush_emit(rctx);
640
641 /* Emit colorbuffers. */
642 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
643 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
644 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
645 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
646 (struct r600_resource*)cb->base.texture,
647 RADEON_USAGE_READWRITE,
648 RADEON_PRIO_SHADER_RW_BUFFER);
649
650 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
651 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
652 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
653 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
654 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
655 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
656 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
657 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
658
659 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
660 radeon_emit(cs, reloc);
661
662 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
663 radeon_emit(cs, reloc);
664 }
665 for (; i < 8 ; i++)
666 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
667 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
668 for (; i < 12; i++)
669 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
670 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
671
672 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
673 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
674 rctx->compute_cb_target_mask);
675
676
677 /* Emit vertex buffer state */
678 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
679 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
680
681 /* Emit constant buffer state */
682 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
683
684 /* Emit sampler state */
685 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
686
687 /* Emit sampler view (texture resource) state */
688 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
689
690 /* Emit compute shader state */
691 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
692
693 /* Emit dispatch state and dispatch packet */
694 evergreen_emit_dispatch(rctx, info);
695
696 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
697 */
698 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
699 R600_CONTEXT_INV_VERTEX_CACHE |
700 R600_CONTEXT_INV_TEX_CACHE;
701 r600_flush_emit(rctx);
702 rctx->b.flags = 0;
703
704 if (rctx->b.chip_class >= CAYMAN) {
705 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
706 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
707 /* DEALLOC_STATE prevents the GPU from hanging when a
708 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
709 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
710 */
711 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
712 radeon_emit(cs, 0);
713 }
714
715 #if 0
716 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
717 for (i = 0; i < cs->cdw; i++) {
718 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
719 }
720 #endif
721
722 }
723
724
725 /**
726 * Emit function for r600_cs_shader_state atom
727 */
728 void evergreen_emit_cs_shader(struct r600_context *rctx,
729 struct r600_atom *atom)
730 {
731 struct r600_cs_shader_state *state =
732 (struct r600_cs_shader_state*)atom;
733 struct r600_pipe_compute *shader = state->shader;
734 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
735 uint64_t va;
736 struct r600_resource *code_bo;
737 unsigned ngpr, nstack;
738
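	/* state->pc is the offset of the selected kernel within code_bo, so the
	 * program start address points at that kernel's first instruction. */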
739 code_bo = shader->code_bo;
740 va = shader->code_bo->gpu_address + state->pc;
741 ngpr = shader->bc.ngpr;
742 nstack = shader->bc.nstack;
743
744 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
745 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
746 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
747 S_0288D4_NUM_GPRS(ngpr)
748 | S_0288D4_STACK_SIZE(nstack));
749 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
750
751 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
752 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
753 code_bo, RADEON_USAGE_READ,
754 RADEON_PRIO_SHADER_BINARY));
755 }
756
757 static void evergreen_launch_grid(struct pipe_context *ctx,
758 const struct pipe_grid_info *info)
759 {
760 struct r600_context *rctx = (struct r600_context *)ctx;
761 #ifdef HAVE_OPENCL
762 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
763 boolean use_kill;
764
765 rctx->cs_shader_state.pc = info->pc;
766 /* Get the config information for this kernel. */
767 r600_shader_binary_read_config(&shader->binary, &shader->bc,
768 info->pc, &use_kill);
769 #endif
770
771 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
772
773
774 evergreen_compute_upload_input(ctx, info);
775 compute_emit_cs(rctx, info);
776 }
777
778 static void evergreen_set_compute_resources(struct pipe_context *ctx,
779 unsigned start, unsigned count,
780 struct pipe_surface **surfaces)
781 {
782 struct r600_context *rctx = (struct r600_context *)ctx;
783 struct r600_surface **resources = (struct r600_surface **)surfaces;
784
785 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
786 start, count);
787
788 for (unsigned i = 0; i < count; i++) {
789 		/* The first four vertex buffers are reserved for parameters and
790 * global buffers. */
791 unsigned vtx_id = 4 + i;
792 if (resources[i]) {
793 struct r600_resource_global *buffer =
794 (struct r600_resource_global*)
795 resources[i]->base.texture;
796 if (resources[i]->base.writable) {
797 assert(i+1 < 12);
798
799 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
800 (struct r600_resource *)resources[i]->base.texture,
801 buffer->chunk->start_in_dw*4,
802 resources[i]->base.texture->width0);
803 }
804
805 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
806 buffer->chunk->start_in_dw * 4,
807 resources[i]->base.texture);
808 }
809 }
810 }
811
812 static void evergreen_set_global_binding(struct pipe_context *ctx,
813 unsigned first, unsigned n,
814 struct pipe_resource **resources,
815 uint32_t **handles)
816 {
817 struct r600_context *rctx = (struct r600_context *)ctx;
818 struct compute_memory_pool *pool = rctx->screen->global_pool;
819 struct r600_resource_global **buffers =
820 (struct r600_resource_global **)resources;
821 unsigned i;
822
823 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
824 first, n);
825
826 if (!resources) {
827 /* XXX: Unset */
828 return;
829 }
830
831 /* We mark these items for promotion to the pool if they
832 * aren't already there */
833 for (i = first; i < first + n; i++) {
834 struct compute_memory_item *item = buffers[i]->chunk;
835
836 if (!is_item_in_pool(item))
837 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
838 }
839
840 if (compute_memory_finalize_pending(pool, ctx) == -1) {
841 /* XXX: Unset */
842 return;
843 }
844
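	/* Patch each handle: the value passed in is an offset within its own
	 * buffer, to which we add the buffer's byte offset within the global
	 * memory pool. */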
845 for (i = first; i < first + n; i++)
846 {
847 uint32_t buffer_offset;
848 uint32_t handle;
849 assert(resources[i]->target == PIPE_BUFFER);
850 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
851
852 buffer_offset = util_le32_to_cpu(*(handles[i]));
853 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
854
855 *(handles[i]) = util_cpu_to_le32(handle);
856 }
857
858 /* globals for writing */
859 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
860 /* globals for reading */
861 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
862 (struct pipe_resource*)pool->bo);
863
864 	/* constants for reading, LLVM puts them in the .text segment */
865 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
866 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
867 }
868
869 /**
870 * This function initializes all the compute specific registers that need to
871 * be initialized for each compute command stream. Registers that are common
872 * to both compute and 3D will be initialized at the beginning of each compute
873 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
874 * packet requires that the shader type bit be set, we must initialize all
875 * context registers needed for compute in this function. The registers
876 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
877 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
878 * on the GPU family.
879 */
880 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
881 {
882 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
883 int num_threads;
884 int num_stack_entries;
885
886 /* since all required registers are initialized in the
887 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
888 */
889 r600_init_command_buffer(cb, 256);
890 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
891
892 /* This must be first. */
893 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
894 r600_store_value(cb, 0x80000000);
895 r600_store_value(cb, 0x80000000);
896
897 /* We're setting config registers here. */
898 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
899 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
900
901 switch (rctx->b.family) {
902 case CHIP_CEDAR:
903 default:
904 num_threads = 128;
905 num_stack_entries = 256;
906 break;
907 case CHIP_REDWOOD:
908 num_threads = 128;
909 num_stack_entries = 256;
910 break;
911 case CHIP_JUNIPER:
912 num_threads = 128;
913 num_stack_entries = 512;
914 break;
915 case CHIP_CYPRESS:
916 case CHIP_HEMLOCK:
917 num_threads = 128;
918 num_stack_entries = 512;
919 break;
920 case CHIP_PALM:
921 num_threads = 128;
922 num_stack_entries = 256;
923 break;
924 case CHIP_SUMO:
925 num_threads = 128;
926 num_stack_entries = 256;
927 break;
928 case CHIP_SUMO2:
929 num_threads = 128;
930 num_stack_entries = 512;
931 break;
932 case CHIP_BARTS:
933 num_threads = 128;
934 num_stack_entries = 512;
935 break;
936 case CHIP_TURKS:
937 num_threads = 128;
938 num_stack_entries = 256;
939 break;
940 case CHIP_CAICOS:
941 num_threads = 128;
942 num_stack_entries = 256;
943 break;
944 }
945
946 /* Config Registers */
947 if (rctx->b.chip_class < CAYMAN)
948 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
949 rctx->screen->b.info.drm_minor);
950 else
951 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
952 rctx->screen->b.info.drm_minor);
953
954 /* The primitive type always needs to be POINTLIST for compute. */
955 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
956 V_008958_DI_PT_POINTLIST);
957
958 if (rctx->b.chip_class < CAYMAN) {
959
960 /* These registers control which simds can be used by each stage.
961 * The default for these registers is 0xffffffff, which means
962 * all simds are available for each stage. It's possible we may
963 * want to play around with these in the future, but for now
964 * the default value is fine.
965 *
966 * R_008E20_SQ_STATIC_THREAD_MGMT1
967 * R_008E24_SQ_STATIC_THREAD_MGMT2
968 * R_008E28_SQ_STATIC_THREAD_MGMT3
969 */
970
971 /* XXX: We may need to adjust the thread and stack resource
972 * values for 3D/compute interop */
973
974 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
975
976 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
977 * Set the number of threads used by the PS/VS/GS/ES stage to
978 * 0.
979 */
980 r600_store_value(cb, 0);
981
982 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
983 * Set the number of threads used by the CS (aka LS) stage to
984 * the maximum number of threads and set the number of threads
985 * for the HS stage to 0. */
986 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
987
988 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
989 * Set the Control Flow stack entries to 0 for PS/VS stages */
990 r600_store_value(cb, 0);
991
992 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
993 * Set the Control Flow stack entries to 0 for GS/ES stages */
994 r600_store_value(cb, 0);
995
996 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
997 		 * Set the Control Flow stack entries to 0 for the HS stage, and
998 * set it to the maximum value for the CS (aka LS) stage. */
999 r600_store_value(cb,
1000 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1001 }
1002 /* Give the compute shader all the available LDS space.
1003 * NOTE: This only sets the maximum number of dwords that a compute
1004 * shader can allocate. When a shader is executed, we still need to
1005 * allocate the appropriate amount of LDS dwords using the
1006 * CM_R_0288E8_SQ_LDS_ALLOC register.
1007 */
1008 if (rctx->b.chip_class < CAYMAN) {
1009 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1010 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1011 } else {
1012 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1013 S_0286FC_NUM_PS_LDS(0) |
1014 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1015 }
1016
1017 /* Context Registers */
1018
1019 if (rctx->b.chip_class < CAYMAN) {
1020 /* workaround for hw issues with dyn gpr - must set all limits
1021 * to 240 instead of 0, 0x1e == 240 / 8
1022 */
1023 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1024 S_028838_PS_GPRS(0x1e) |
1025 S_028838_VS_GPRS(0x1e) |
1026 S_028838_GS_GPRS(0x1e) |
1027 S_028838_ES_GPRS(0x1e) |
1028 S_028838_HS_GPRS(0x1e) |
1029 S_028838_LS_GPRS(0x1e));
1030 }
1031
1032 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1033 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1034 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1035
1036 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1037
1038 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1039 S_0286E8_TID_IN_GROUP_ENA(1) |
1040 S_0286E8_TGID_ENA(1) |
1041 S_0286E8_DISABLE_INDEX_PACK(1));
1042
1043 	/* The LOOP_CONST registers are an optimization for loops that allows
1044 	 * you to store the initial counter, increment value, and maximum
1045 	 * counter value in a register so that the hardware can calculate the
1046 	 * correct number of iterations for the loop, so that you don't need
1047 	 * to have the loop counter in your shader code. We don't currently use
1048 	 * this optimization, so we must keep track of the counter in the
1049 	 * shader and use a break instruction to exit loops. However, the
1050 	 * hardware still uses this register to determine when to exit a
1051 	 * loop, so we need to initialize the counter to 0, set the increment
1052 	 * value to 1 and the maximum counter value to 4095 (0xfff), which
1053 	 * is the maximum value allowed. This gives us a maximum of 4096
1054 	 * iterations for our loops, but hopefully our break instruction will
1055 	 * execute some time before the 4096th iteration.
1056 */
1057 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1058 }
1059
1060 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1061 {
1062 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1063 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1064 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1065 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1066 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1067 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1068 rctx->b.b.launch_grid = evergreen_launch_grid;
1069
1070 }
1071
1072 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1073 struct pipe_resource *resource,
1074 unsigned level,
1075 unsigned usage,
1076 const struct pipe_box *box,
1077 struct pipe_transfer **ptransfer)
1078 {
1079 struct r600_context *rctx = (struct r600_context*)ctx;
1080 struct compute_memory_pool *pool = rctx->screen->global_pool;
1081 struct r600_resource_global* buffer =
1082 (struct r600_resource_global*)resource;
1083
1084 struct compute_memory_item *item = buffer->chunk;
1085 struct pipe_resource *dst = NULL;
1086 unsigned offset = box->x;
1087
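	/* Items living in the pool must be demoted to their own buffer before
	 * they can be mapped; items not yet in the pool get a dedicated buffer
	 * on first use. */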
1088 if (is_item_in_pool(item)) {
1089 compute_memory_demote_item(pool, item, ctx);
1090 }
1091 else {
1092 if (item->real_buffer == NULL) {
1093 item->real_buffer =
1094 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1095 }
1096 }
1097
1098 dst = (struct pipe_resource*)item->real_buffer;
1099
1100 if (usage & PIPE_TRANSFER_READ)
1101 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1102
1103 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1104 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1105 "width = %u, height = %u, depth = %u)\n", level, usage,
1106 box->x, box->y, box->z, box->width, box->height,
1107 box->depth);
1108 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1109 "%u (box.x)\n", item->id, box->x);
1110
1111
1112 assert(resource->target == PIPE_BUFFER);
1113 assert(resource->bind & PIPE_BIND_GLOBAL);
1114 assert(box->x >= 0);
1115 assert(box->y == 0);
1116 assert(box->z == 0);
1117
1118 ///TODO: do it better, mapping is not possible if the pool is too big
1119 return pipe_buffer_map_range(ctx, dst,
1120 offset, box->width, usage, ptransfer);
1121 }
1122
1123 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1124 struct pipe_transfer *transfer)
1125 {
1126 	/* struct r600_resource_global are not real resources; they just map
1127 	 * to an offset within the compute memory pool. The function
1128 	 * r600_compute_global_transfer_map() maps the memory pool
1129 	 * resource rather than the struct r600_resource_global passed to
1130 	 * it as an argument, and then initializes ptransfer->resource with
1131 	 * the memory pool resource (via pipe_buffer_map_range).
1132 	 * When transfer_unmap is called it uses the memory pool's
1133 	 * vtable, which calls r600_buffer_transfer_unmap() rather than
1134 	 * this function.
1135 	 */
1136 assert (!"This function should not be called");
1137 }
1138
1139 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1140 struct pipe_transfer *transfer,
1141 const struct pipe_box *box)
1142 {
1143 assert(0 && "TODO");
1144 }
1145
1146 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1147 struct pipe_resource *res)
1148 {
1149 struct r600_resource_global* buffer = NULL;
1150 struct r600_screen* rscreen = NULL;
1151
1152 assert(res->target == PIPE_BUFFER);
1153 assert(res->bind & PIPE_BIND_GLOBAL);
1154
1155 buffer = (struct r600_resource_global*)res;
1156 rscreen = (struct r600_screen*)screen;
1157
1158 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1159
1160 buffer->chunk = NULL;
1161 free(res);
1162 }
1163
1164 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1165 {
1166 u_default_resource_get_handle, /* get_handle */
1167 r600_compute_global_buffer_destroy, /* resource_destroy */
1168 r600_compute_global_transfer_map, /* transfer_map */
1169 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1170 r600_compute_global_transfer_unmap, /* transfer_unmap */
1171 };
1172
1173 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1174 const struct pipe_resource *templ)
1175 {
1176 struct r600_resource_global* result = NULL;
1177 struct r600_screen* rscreen = NULL;
1178 int size_in_dw = 0;
1179
1180 assert(templ->target == PIPE_BUFFER);
1181 assert(templ->bind & PIPE_BIND_GLOBAL);
1182 assert(templ->array_size == 1 || templ->array_size == 0);
1183 assert(templ->depth0 == 1 || templ->depth0 == 0);
1184 assert(templ->height0 == 1 || templ->height0 == 0);
1185
1186 result = (struct r600_resource_global*)
1187 CALLOC(sizeof(struct r600_resource_global), 1);
1188 rscreen = (struct r600_screen*)screen;
1189
1190 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1191 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1192 templ->array_size);
1193
1194 result->base.b.vtbl = &r600_global_buffer_vtbl;
1195 result->base.b.b = *templ;
1196 result->base.b.b.screen = screen;
1197 pipe_reference_init(&result->base.b.b.reference, 1);
1198
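	/* The compute memory pool is managed in dword units, so round the
	 * requested byte size up to whole dwords. */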
1199 size_in_dw = (templ->width0+3) / 4;
1200
1201 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1202
1203 if (result->chunk == NULL)
1204 {
1205 free(result);
1206 return NULL;
1207 }
1208
1209 return &result->base.b.b;
1210 }