0573f8ee2b5f5e0fe44434aefdcf74ef1c6b1605
mesa.git: src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "pipebuffer/pb_buffer.h"
45 #include "evergreend.h"
46 #include "r600_shader.h"
47 #include "r600_pipe.h"
48 #include "r600_formats.h"
49 #include "evergreen_compute.h"
50 #include "evergreen_compute_internal.h"
51 #include "compute_memory_pool.h"
52 #include "sb/sb_public.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding writes
57 VTX1 is for global binding reads
58
59 for writing images: RAT1...
60 for reading images: TEX2...
61 TEX2 and RAT1 are paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => 10 image bindings for writing max.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too because of linear indexing
83
84 */
85
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 0, PIPE_USAGE_IMMUTABLE, size);
94
95 return (struct r600_resource *)buffer;
96 }
97
98
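/**
 * Bind a buffer as a RAT (random access target) so the compute shader can
 * write to it: the buffer is wrapped in an R32_UINT surface, stored in
 * framebuffer color-buffer slot 'id', and the compute cb_target_mask is
 * updated to enable that slot.
 */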
99 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
100 unsigned id,
101 struct r600_resource *bo,
102 int start,
103 int size)
104 {
105 struct pipe_surface rat_templ;
106 struct r600_surface *surf = NULL;
107 struct r600_context *rctx = NULL;
108
109 assert(id < 12);
110 assert((size & 3) == 0);
111 assert((start & 0xFF) == 0);
112
113 rctx = pipe->ctx;
114
115 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
116
117 /* Create the RAT surface */
118 memset(&rat_templ, 0, sizeof(rat_templ));
119 rat_templ.format = PIPE_FORMAT_R32_UINT;
120 rat_templ.u.tex.level = 0;
121 rat_templ.u.tex.first_layer = 0;
122 rat_templ.u.tex.last_layer = 0;
123
124 /* Add the RAT to the list of color buffers */
125 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
126 (struct pipe_context *)pipe->ctx,
127 (struct pipe_resource *)bo, &rat_templ);
128
129 /* Update the number of color buffers */
130 pipe->ctx->framebuffer.state.nr_cbufs =
131 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
132
133 /* Update the cb_target_mask
134 * XXX: I think this is a potential spot for bugs once we start doing
135 * GL interop. cb_target_mask may be modified in the 3D sections
136 * of this driver. */
137 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
138
139 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
140 evergreen_init_color_surface_rat(rctx, surf);
141 }
142
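/**
 * Point compute vertex buffer slot 'vb_index' at 'buffer' with the given
 * byte offset, mark the slot enabled and dirty, and request a vertex
 * (texture) cache invalidation so the shader sees the new contents.
 */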
143 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
144 unsigned vb_index,
145 unsigned offset,
146 struct pipe_resource *buffer)
147 {
148 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
149 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
150 vb->stride = 1;
151 vb->buffer_offset = offset;
152 vb->buffer.resource = buffer;
153 vb->is_user_buffer = false;
154
155 /* The vertex instructions in the compute shaders use the texture cache,
156 * so we need to invalidate it. */
157 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
158 state->enabled_mask |= 1 << vb_index;
159 state->dirty_mask |= 1 << vb_index;
160 r600_mark_atom_dirty(rctx, &state->atom);
161 }
162
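/**
 * Bind a sub-range of 'buffer' as constant buffer 'cb_index' of the
 * compute shader through the regular set_constant_buffer() hook.
 */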
163 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
164 unsigned cb_index,
165 unsigned offset,
166 unsigned size,
167 struct pipe_resource *buffer)
168 {
169 struct pipe_constant_buffer cb;
170 cb.buffer_size = size;
171 cb.buffer_offset = offset;
172 cb.buffer = buffer;
173 cb.user_buffer = NULL;
174
175 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
176 }
177
178 /* We need to define these R600 registers here, because we can't include
179 * evergreend.h and r600d.h.
180 */
181 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
182 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
183
184 #ifdef HAVE_OPENCL
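/**
 * Record the offsets (st_value) of all defined global symbols from an ELF
 * symbol table into binary->global_symbol_offsets, keeping the list in
 * ascending order, and bump binary->global_symbol_count for each entry.
 */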
185 static void parse_symbol_table(Elf_Data *symbol_table_data,
186 const GElf_Shdr *symbol_table_header,
187 struct ac_shader_binary *binary)
188 {
189 GElf_Sym symbol;
190 unsigned i = 0;
191 unsigned symbol_count =
192 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
193
194 /* We are over-allocating this list, because symbol_count gives the
195 * total number of symbols, and we will only be filling the list
196 * with offsets of global symbols. The memory savings from
197 * allocating the correct size of this list will be small, and
198 * I don't think it is worth the cost of pre-computing the number
199 * of global symbols.
200 */
201 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
202
203 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
204 unsigned i;
205 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
206 symbol.st_shndx == 0 /* Undefined symbol */) {
207 continue;
208 }
209
210 binary->global_symbol_offsets[binary->global_symbol_count] =
211 symbol.st_value;
212
213 /* Sort the list using bubble sort. This list will usually
214 * be small. */
215 for (i = binary->global_symbol_count; i > 0; --i) {
216 uint64_t lhs = binary->global_symbol_offsets[i - 1];
217 uint64_t rhs = binary->global_symbol_offsets[i];
218 if (lhs < rhs) {
219 break;
220 }
221 binary->global_symbol_offsets[i] = lhs;
222 binary->global_symbol_offsets[i - 1] = rhs;
223 }
224 ++binary->global_symbol_count;
225 }
226 }
227
228
229 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
230 unsigned symbol_sh_link,
231 struct ac_shader_binary *binary)
232 {
233 unsigned i;
234
235 if (!relocs || !symbols || !binary->reloc_count) {
236 return;
237 }
238 binary->relocs = CALLOC(binary->reloc_count,
239 sizeof(struct ac_shader_reloc));
240 for (i = 0; i < binary->reloc_count; i++) {
241 GElf_Sym symbol;
242 GElf_Rel rel;
243 char *symbol_name;
244 struct ac_shader_reloc *reloc = &binary->relocs[i];
245
246 gelf_getrel(relocs, i, &rel);
247 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
248 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
249
250 reloc->offset = rel.r_offset;
251 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
252 reloc->name[sizeof(reloc->name)-1] = 0;
253 }
254 }
255
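/**
 * Parse the ELF binary produced by the compiler: copy .text into
 * binary->code, .AMDGPU.config into binary->config and .rodata into
 * binary->rodata, keep the optional .AMDGPU.disasm string, walk the
 * symbol table and .rel.text relocations, and derive
 * config_size_per_symbol for binaries that contain several kernels.
 */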
256 static void r600_elf_read(const char *elf_data, unsigned elf_size,
257 struct ac_shader_binary *binary)
258 {
259 char *elf_buffer;
260 Elf *elf;
261 Elf_Scn *section = NULL;
262 Elf_Data *symbols = NULL, *relocs = NULL;
263 size_t section_str_index;
264 unsigned symbol_sh_link = 0;
265
266 /* One of the libelf implementations
267 * (http://www.mr511.de/software/english.htm) requires calling
268 * elf_version() before elf_memory().
269 */
270 elf_version(EV_CURRENT);
271 elf_buffer = MALLOC(elf_size);
272 memcpy(elf_buffer, elf_data, elf_size);
273
274 elf = elf_memory(elf_buffer, elf_size);
275
276 elf_getshdrstrndx(elf, &section_str_index);
277
278 while ((section = elf_nextscn(elf, section))) {
279 const char *name;
280 Elf_Data *section_data = NULL;
281 GElf_Shdr section_header;
282 if (gelf_getshdr(section, &section_header) != &section_header) {
283 fprintf(stderr, "Failed to read ELF section header\n");
284 return;
285 }
286 name = elf_strptr(elf, section_str_index, section_header.sh_name);
287 if (!strcmp(name, ".text")) {
288 section_data = elf_getdata(section, section_data);
289 binary->code_size = section_data->d_size;
290 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
291 memcpy(binary->code, section_data->d_buf, binary->code_size);
292 } else if (!strcmp(name, ".AMDGPU.config")) {
293 section_data = elf_getdata(section, section_data);
294 binary->config_size = section_data->d_size;
295 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
296 memcpy(binary->config, section_data->d_buf, binary->config_size);
297 } else if (!strcmp(name, ".AMDGPU.disasm")) {
298 /* Always read disassembly if it's available. */
299 section_data = elf_getdata(section, section_data);
300 binary->disasm_string = strndup(section_data->d_buf,
301 section_data->d_size);
302 } else if (!strncmp(name, ".rodata", 7)) {
303 section_data = elf_getdata(section, section_data);
304 binary->rodata_size = section_data->d_size;
305 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
306 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
307 } else if (!strncmp(name, ".symtab", 7)) {
308 symbols = elf_getdata(section, section_data);
309 symbol_sh_link = section_header.sh_link;
310 parse_symbol_table(symbols, &section_header, binary);
311 } else if (!strcmp(name, ".rel.text")) {
312 relocs = elf_getdata(section, section_data);
313 binary->reloc_count = section_header.sh_size /
314 section_header.sh_entsize;
315 }
316 }
317
318 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
319
320 if (elf){
321 elf_end(elf);
322 }
323 FREE(elf_buffer);
324
325 /* Cache the config size per symbol */
326 if (binary->global_symbol_count) {
327 binary->config_size_per_symbol =
328 binary->config_size / binary->global_symbol_count;
329 } else {
330 binary->global_symbol_count = 1;
331 binary->config_size_per_symbol = binary->config_size;
332 }
333 }
334
335 static const unsigned char *r600_shader_binary_config_start(
336 const struct ac_shader_binary *binary,
337 uint64_t symbol_offset)
338 {
339 unsigned i;
340 for (i = 0; i < binary->global_symbol_count; ++i) {
341 if (binary->global_symbol_offsets[i] == symbol_offset) {
342 unsigned offset = i * binary->config_size_per_symbol;
343 return binary->config + offset;
344 }
345 }
346 return binary->config;
347 }
348
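/**
 * Walk the (register, value) dword pairs of the .AMDGPU.config section
 * belonging to the kernel at 'symbol_offset' and extract the GPR count,
 * stack size, LDS allocation and kill-enable flag into the bytecode
 * struct.
 */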
349 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
350 struct r600_bytecode *bc,
351 uint64_t symbol_offset,
352 boolean *use_kill)
353 {
354 unsigned i;
355 const unsigned char *config =
356 r600_shader_binary_config_start(binary, symbol_offset);
357
358 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
359 unsigned reg =
360 util_le32_to_cpu(*(uint32_t*)(config + i));
361 unsigned value =
362 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
363 switch (reg) {
364 /* R600 / R700 */
365 case R_028850_SQ_PGM_RESOURCES_PS:
366 case R_028868_SQ_PGM_RESOURCES_VS:
367 /* Evergreen / Northern Islands */
368 case R_028844_SQ_PGM_RESOURCES_PS:
369 case R_028860_SQ_PGM_RESOURCES_VS:
370 case R_0288D4_SQ_PGM_RESOURCES_LS:
371 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
372 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
373 break;
374 case R_02880C_DB_SHADER_CONTROL:
375 *use_kill = G_02880C_KILL_ENABLE(value);
376 break;
377 case R_0288E8_SQ_LDS_ALLOC:
378 bc->nlds_dw = value;
379 break;
380 }
381 }
382 }
383
384 static unsigned r600_create_shader(struct r600_bytecode *bc,
385 const struct ac_shader_binary *binary,
386 boolean *use_kill)
387
388 {
389 assert(binary->code_size % 4 == 0);
390 bc->bytecode = CALLOC(1, binary->code_size);
391 memcpy(bc->bytecode, binary->code, binary->code_size);
392 bc->ndw = binary->code_size / 4;
393
394 r600_shader_binary_read_config(binary, bc, 0, use_kill);
395 return 0;
396 }
397
398 #endif
399
400 static void r600_destroy_shader(struct r600_bytecode *bc)
401 {
402 FREE(bc->bytecode);
403 }
404
405 static void *evergreen_create_compute_state(struct pipe_context *ctx,
406 const struct pipe_compute_state *cso)
407 {
408 struct r600_context *rctx = (struct r600_context *)ctx;
409 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
410 #ifdef HAVE_OPENCL
411 const struct pipe_llvm_program_header *header;
412 const char *code;
413 void *p;
414 boolean use_kill;
415
416 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
417 header = cso->prog;
418 code = cso->prog + sizeof(struct pipe_llvm_program_header);
419 radeon_shader_binary_init(&shader->binary);
420 r600_elf_read(code, header->num_bytes, &shader->binary);
421 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
422
423 /* Upload code + ROdata */
424 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
425 shader->bc.ndw * 4);
426 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
427 //TODO: use util_memcpy_cpu_to_le32 ?
428 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
429 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
430 #endif
431
432 shader->ctx = rctx;
433 shader->local_size = cso->req_local_mem;
434 shader->private_size = cso->req_private_mem;
435 shader->input_size = cso->req_input_mem;
436
437 return shader;
438 }
439
440 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
441 {
442 struct r600_context *rctx = (struct r600_context *)ctx;
443 struct r600_pipe_compute *shader = state;
444
445 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
446
447 if (!shader)
448 return;
449
450 #ifdef HAVE_OPENCL
451 radeon_shader_binary_clean(&shader->binary);
452 #endif
453 r600_destroy_shader(&shader->bc);
454
455 /* TODO destroy shader->code_bo, shader->const_bo
456 * we'll need something like r600_buffer_free */
457 FREE(shader);
458 }
459
460 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
461 {
462 struct r600_context *rctx = (struct r600_context *)ctx;
463
464 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
465
466 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
467 }
468
469 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
470 * kernel parameters, there are implicit parameters that need to be stored
471 * in the vertex buffer as well. Here is how these parameters are organized in
472 * the buffer:
473 *
474 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
475 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
476 * DWORDS 6-8: Number of work items within each work group in each dimension
477 * (x,y,z)
478 * DWORDS 9+ : Kernel parameters
479 */
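/* For example (hypothetical numbers, purely illustrative): a launch with
 * grid = (4, 2, 1) and block = (64, 1, 1) would be uploaded as
 *   DWORDS 0-2: 4, 2, 1     (work groups)
 *   DWORDS 3-5: 256, 2, 1   (global work items = grid * block)
 *   DWORDS 6-8: 64, 1, 1    (work items per group)
 *   DWORDS 9+ : the kernel's own arguments
 */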
480 static void evergreen_compute_upload_input(struct pipe_context *ctx,
481 const struct pipe_grid_info *info)
482 {
483 struct r600_context *rctx = (struct r600_context *)ctx;
484 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
485 unsigned i;
486 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
487 * parameters.
488 */
489 unsigned input_size = shader->input_size + 36;
490 uint32_t *num_work_groups_start;
491 uint32_t *global_size_start;
492 uint32_t *local_size_start;
493 uint32_t *kernel_parameters_start;
494 struct pipe_box box;
495 struct pipe_transfer *transfer = NULL;
496
497 if (shader->input_size == 0) {
498 return;
499 }
500
501 if (!shader->kernel_param) {
502 /* Add space for the grid dimensions */
503 shader->kernel_param = (struct r600_resource *)
504 pipe_buffer_create(ctx->screen, 0,
505 PIPE_USAGE_IMMUTABLE, input_size);
506 }
507
508 u_box_1d(0, input_size, &box);
509 num_work_groups_start = ctx->transfer_map(ctx,
510 (struct pipe_resource*)shader->kernel_param,
511 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
512 &box, &transfer);
513 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
514 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
515 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
516
517 /* Copy the work group size */
518 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
519
520 /* Copy the global size */
521 for (i = 0; i < 3; i++) {
522 global_size_start[i] = info->grid[i] * info->block[i];
523 }
524
525 /* Copy the local dimensions */
526 memcpy(local_size_start, info->block, 3 * sizeof(uint));
527
528 /* Copy the kernel inputs */
529 memcpy(kernel_parameters_start, info->input, shader->input_size);
530
531 for (i = 0; i < (input_size / 4); i++) {
532 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
533 ((unsigned*)num_work_groups_start)[i]);
534 }
535
536 ctx->transfer_unmap(ctx, transfer);
537
538 /* ID=0 and ID=3 are reserved for the parameters.
539 * LLVM will preferably use ID=0, but it does not work for dynamic
540 * indices. */
541 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
542 (struct pipe_resource*)shader->kernel_param);
543 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
544 (struct pipe_resource*)shader->kernel_param);
545 }
546
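/**
 * Emit the register writes and the DISPATCH_DIRECT packet for one grid
 * launch: thread group size, per-dimension thread counts, the LDS
 * allocation (local memory plus what the shader config requested) and
 * finally the grid dimensions.
 */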
547 static void evergreen_emit_dispatch(struct r600_context *rctx,
548 const struct pipe_grid_info *info)
549 {
550 int i;
551 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
552 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
553 unsigned num_waves;
554 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
555 unsigned wave_divisor = (16 * num_pipes);
556 int group_size = 1;
557 int grid_size = 1;
558 unsigned lds_size = shader->local_size / 4 +
559 shader->bc.nlds_dw;
560
561
562 /* Calculate group_size/grid_size */
563 for (i = 0; i < 3; i++) {
564 group_size *= info->block[i];
565 }
566
567 for (i = 0; i < 3; i++) {
568 grid_size *= info->grid[i];
569 }
570
571 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
572 num_waves = (info->block[0] * info->block[1] * info->block[2] +
573 wave_divisor - 1) / wave_divisor;
574
575 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
576 "%u wavefronts per thread block, "
577 "allocating %u dwords lds.\n",
578 num_pipes, num_waves, lds_size);
579
580 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
581
582 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
583 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
584 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
585 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
586
587 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
588 group_size);
589
590 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
591 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
592 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
593 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
594
595 if (rctx->b.chip_class < CAYMAN) {
596 assert(lds_size <= 8192);
597 } else {
598 /* Cayman appears to have a slightly smaller limit, see the
599 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
600 assert(lds_size <= 8160);
601 }
602
603 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
604 lds_size | (num_waves << 14));
605
606 /* Dispatch packet */
607 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
608 radeon_emit(cs, info->grid[0]);
609 radeon_emit(cs, info->grid[1]);
610 radeon_emit(cs, info->grid[2]);
611 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
612 radeon_emit(cs, 1);
613
614 if (rctx->is_debug)
615 eg_trace_emit(rctx);
616 }
617
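/**
 * Emit the color-buffer (RAT) state on the compute ring: program
 * CB_COLOR0..7 from the surfaces bound in the framebuffer state together
 * with relocations for their backing buffers, mark the remaining slots
 * invalid, and set CB_TARGET_MASK from compute_cb_target_mask.
 */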
618 static void compute_setup_cbs(struct r600_context *rctx)
619 {
620 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
621 unsigned i;
622
623 /* Emit colorbuffers. */
624 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
625 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
626 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
627 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
628 (struct r600_resource*)cb->base.texture,
629 RADEON_USAGE_READWRITE,
630 RADEON_PRIO_SHADER_RW_BUFFER);
631
632 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
633 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
634 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
635 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
636 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
637 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
638 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
639 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
640
641 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
642 radeon_emit(cs, reloc);
643
644 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
645 radeon_emit(cs, reloc);
646 }
647 for (; i < 8 ; i++)
648 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
649 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
650 for (; i < 12; i++)
651 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
652 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
653
654 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
655 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
656 rctx->compute_cb_target_mask);
657 }
658
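/**
 * Build the command stream for one compute dispatch: flush the DMA ring,
 * emit the start-of-CS state, color buffers, vertex/constant buffer,
 * sampler and shader state, then the dispatch itself, followed by the
 * cache invalidation flush and, on Cayman and later, the
 * CS_PARTIAL_FLUSH + DEALLOC_STATE workaround.
 */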
659 static void compute_emit_cs(struct r600_context *rctx,
660 const struct pipe_grid_info *info)
661 {
662 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
663
664 /* make sure that the gfx ring is the only one active */
665 if (radeon_emitted(rctx->b.dma.cs, 0)) {
666 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
667 }
668
669 /* Initialize all the compute-related registers.
670 *
671 * See evergreen_init_atom_start_compute_cs() in this file for the list
672 * of registers initialized by the start_compute_cs_cmd atom.
673 */
674 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
675
676 /* emit config state */
677 if (rctx->b.chip_class == EVERGREEN)
678 r600_emit_atom(rctx, &rctx->config_state.atom);
679
680 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
681 r600_flush_emit(rctx);
682
683 compute_setup_cbs(rctx);
684
685 /* Emit vertex buffer state */
686 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
687 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
688
689 /* Emit constant buffer state */
690 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
691
692 /* Emit sampler state */
693 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
694
695 /* Emit sampler view (texture resource) state */
696 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
697
698 /* Emit compute shader state */
699 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
700
701 /* Emit dispatch state and dispatch packet */
702 evergreen_emit_dispatch(rctx, info);
703
704 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
705 */
706 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
707 R600_CONTEXT_INV_VERTEX_CACHE |
708 R600_CONTEXT_INV_TEX_CACHE;
709 r600_flush_emit(rctx);
710 rctx->b.flags = 0;
711
712 if (rctx->b.chip_class >= CAYMAN) {
713 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
714 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
715 /* DEALLOC_STATE prevents the GPU from hanging when a
716 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
717 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
718 */
719 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
720 radeon_emit(cs, 0);
721 }
722
723 #if 0
724 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
725 for (i = 0; i < cs->cdw; i++) {
726 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
727 }
728 #endif
729
730 }
731
732
733 /**
734 * Emit function for r600_cs_shader_state atom
735 */
736 void evergreen_emit_cs_shader(struct r600_context *rctx,
737 struct r600_atom *atom)
738 {
739 struct r600_cs_shader_state *state =
740 (struct r600_cs_shader_state*)atom;
741 struct r600_pipe_compute *shader = state->shader;
742 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
743 uint64_t va;
744 struct r600_resource *code_bo;
745 unsigned ngpr, nstack;
746
747 code_bo = shader->code_bo;
748 va = shader->code_bo->gpu_address + state->pc;
749 ngpr = shader->bc.ngpr;
750 nstack = shader->bc.nstack;
751
752 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
753 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
754 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
755 S_0288D4_NUM_GPRS(ngpr) |
756 S_0288D4_DX10_CLAMP(1) |
757 S_0288D4_STACK_SIZE(nstack));
758 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
759
760 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
761 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
762 code_bo, RADEON_USAGE_READ,
763 RADEON_PRIO_SHADER_BINARY));
764 }
765
766 static void evergreen_launch_grid(struct pipe_context *ctx,
767 const struct pipe_grid_info *info)
768 {
769 struct r600_context *rctx = (struct r600_context *)ctx;
770 #ifdef HAVE_OPENCL
771 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
772 boolean use_kill;
773
774 rctx->cs_shader_state.pc = info->pc;
775 /* Get the config information for this kernel. */
776 r600_shader_binary_read_config(&shader->binary, &shader->bc,
777 info->pc, &use_kill);
778 #endif
779
780 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
781
782
783 evergreen_compute_upload_input(ctx, info);
784 compute_emit_cs(rctx, info);
785 }
786
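/**
 * Bind the pipe_surface resources handed in by the state tracker:
 * writable surfaces are additionally bound as RAT(i+1), and every surface
 * is made readable through a vertex buffer slot starting at VTX4 (slots
 * 0-3 are reserved for parameters and global buffers).
 */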
787 static void evergreen_set_compute_resources(struct pipe_context *ctx,
788 unsigned start, unsigned count,
789 struct pipe_surface **surfaces)
790 {
791 struct r600_context *rctx = (struct r600_context *)ctx;
792 struct r600_surface **resources = (struct r600_surface **)surfaces;
793
794 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
795 start, count);
796
797 for (unsigned i = 0; i < count; i++) {
798 /* The first four vertex buffers are reserved for parameters and
799 * global buffers. */
800 unsigned vtx_id = 4 + i;
801 if (resources[i]) {
802 struct r600_resource_global *buffer =
803 (struct r600_resource_global*)
804 resources[i]->base.texture;
805 if (resources[i]->base.writable) {
806 assert(i+1 < 12);
807
808 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
809 (struct r600_resource *)resources[i]->base.texture,
810 buffer->chunk->start_in_dw*4,
811 resources[i]->base.texture->width0);
812 }
813
814 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
815 buffer->chunk->start_in_dw * 4,
816 resources[i]->base.texture);
817 }
818 }
819 }
820
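/**
 * Bind OpenCL global buffers: the buffers are promoted into the compute
 * memory pool, each handle is rewritten to the buffer's byte offset
 * within the pool, and the pool itself is bound as RAT0 for writes and
 * vertex buffer 1 for reads (vertex buffer 2 exposes the shader code for
 * constant reads).
 */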
821 static void evergreen_set_global_binding(struct pipe_context *ctx,
822 unsigned first, unsigned n,
823 struct pipe_resource **resources,
824 uint32_t **handles)
825 {
826 struct r600_context *rctx = (struct r600_context *)ctx;
827 struct compute_memory_pool *pool = rctx->screen->global_pool;
828 struct r600_resource_global **buffers =
829 (struct r600_resource_global **)resources;
830 unsigned i;
831
832 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
833 first, n);
834
835 if (!resources) {
836 /* XXX: Unset */
837 return;
838 }
839
840 /* We mark these items for promotion to the pool if they
841 * aren't already there */
842 for (i = first; i < first + n; i++) {
843 struct compute_memory_item *item = buffers[i]->chunk;
844
845 if (!is_item_in_pool(item))
846 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
847 }
848
849 if (compute_memory_finalize_pending(pool, ctx) == -1) {
850 /* XXX: Unset */
851 return;
852 }
853
854 for (i = first; i < first + n; i++)
855 {
856 uint32_t buffer_offset;
857 uint32_t handle;
858 assert(resources[i]->target == PIPE_BUFFER);
859 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
860
861 buffer_offset = util_le32_to_cpu(*(handles[i]));
862 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
863
864 *(handles[i]) = util_cpu_to_le32(handle);
865 }
866
867 /* globals for writing */
868 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
869 /* globals for reading */
870 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
871 (struct pipe_resource*)pool->bo);
872
873 /* constants for reading, LLVM puts them in text segment */
874 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
875 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
876 }
877
878 /**
879 * This function initializes all the compute specific registers that need to
880 * be initialized for each compute command stream. Registers that are common
881 * to both compute and 3D will be initialized at the beginning of each compute
882 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
883 * packet requires that the shader type bit be set, we must initialize all
884 * context registers needed for compute in this function. The registers
885 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
886 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
887 * on the GPU family.
888 */
889 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
890 {
891 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
892 int num_threads;
893 int num_stack_entries;
894
895 /* since all required registers are initialized in the
896 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
897 */
898 r600_init_command_buffer(cb, 256);
899 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
900
901 /* This must be first. */
902 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
903 r600_store_value(cb, 0x80000000);
904 r600_store_value(cb, 0x80000000);
905
906 /* We're setting config registers here. */
907 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
908 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
909
910 switch (rctx->b.family) {
911 case CHIP_CEDAR:
912 default:
913 num_threads = 128;
914 num_stack_entries = 256;
915 break;
916 case CHIP_REDWOOD:
917 num_threads = 128;
918 num_stack_entries = 256;
919 break;
920 case CHIP_JUNIPER:
921 num_threads = 128;
922 num_stack_entries = 512;
923 break;
924 case CHIP_CYPRESS:
925 case CHIP_HEMLOCK:
926 num_threads = 128;
927 num_stack_entries = 512;
928 break;
929 case CHIP_PALM:
930 num_threads = 128;
931 num_stack_entries = 256;
932 break;
933 case CHIP_SUMO:
934 num_threads = 128;
935 num_stack_entries = 256;
936 break;
937 case CHIP_SUMO2:
938 num_threads = 128;
939 num_stack_entries = 512;
940 break;
941 case CHIP_BARTS:
942 num_threads = 128;
943 num_stack_entries = 512;
944 break;
945 case CHIP_TURKS:
946 num_threads = 128;
947 num_stack_entries = 256;
948 break;
949 case CHIP_CAICOS:
950 num_threads = 128;
951 num_stack_entries = 256;
952 break;
953 }
954
955 /* Config Registers */
956 if (rctx->b.chip_class < CAYMAN)
957 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
958 rctx->screen->b.info.drm_minor);
959 else
960 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
961 rctx->screen->b.info.drm_minor);
962
963 /* The primitive type always needs to be POINTLIST for compute. */
964 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
965 V_008958_DI_PT_POINTLIST);
966
967 if (rctx->b.chip_class < CAYMAN) {
968
969 /* These registers control which simds can be used by each stage.
970 * The default for these registers is 0xffffffff, which means
971 * all simds are available for each stage. It's possible we may
972 * want to play around with these in the future, but for now
973 * the default value is fine.
974 *
975 * R_008E20_SQ_STATIC_THREAD_MGMT1
976 * R_008E24_SQ_STATIC_THREAD_MGMT2
977 * R_008E28_SQ_STATIC_THREAD_MGMT3
978 */
979
980 /* XXX: We may need to adjust the thread and stack resource
981 * values for 3D/compute interop */
982
983 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
984
985 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
986 * Set the number of threads used by the PS/VS/GS/ES stage to
987 * 0.
988 */
989 r600_store_value(cb, 0);
990
991 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
992 * Set the number of threads used by the CS (aka LS) stage to
993 * the maximum number of threads and set the number of threads
994 * for the HS stage to 0. */
995 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
996
997 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
998 * Set the Control Flow stack entries to 0 for PS/VS stages */
999 r600_store_value(cb, 0);
1000
1001 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1002 * Set the Control Flow stack entries to 0 for GS/ES stages */
1003 r600_store_value(cb, 0);
1004
1005 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1006 * Set the Control Flow stack entries to 0 for the HS stage, and
1007 * set it to the maximum value for the CS (aka LS) stage. */
1008 r600_store_value(cb,
1009 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1010 }
1011 /* Give the compute shader all the available LDS space.
1012 * NOTE: This only sets the maximum number of dwords that a compute
1013 * shader can allocate. When a shader is executed, we still need to
1014 * allocate the appropriate amount of LDS dwords using the
1015 * CM_R_0288E8_SQ_LDS_ALLOC register.
1016 */
1017 if (rctx->b.chip_class < CAYMAN) {
1018 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1019 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1020 } else {
1021 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1022 S_0286FC_NUM_PS_LDS(0) |
1023 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1024 }
1025
1026 /* Context Registers */
1027
1028 if (rctx->b.chip_class < CAYMAN) {
1029 /* workaround for hw issues with dyn gpr - must set all limits
1030 * to 240 instead of 0, 0x1e == 240 / 8
1031 */
1032 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1033 S_028838_PS_GPRS(0x1e) |
1034 S_028838_VS_GPRS(0x1e) |
1035 S_028838_GS_GPRS(0x1e) |
1036 S_028838_ES_GPRS(0x1e) |
1037 S_028838_HS_GPRS(0x1e) |
1038 S_028838_LS_GPRS(0x1e));
1039 }
1040
1041 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1042 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1043 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1044
1045 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1046
1047 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1048 S_0286E8_TID_IN_GROUP_ENA(1) |
1049 S_0286E8_TGID_ENA(1) |
1050 S_0286E8_DISABLE_INDEX_PACK(1));
1051
1052 /* The LOOP_CONST registers are an optimization for loops that allows
1053 * you to store the initial counter, increment value, and maximum
1054 * counter value in a register so that the hardware can calculate the
1055 * correct number of iterations for the loop, so that you don't need
1056 * to have the loop counter in your shader code. We don't currently use
1057 * this optimization, so we must keep track of the counter in the
1058 * shader and use a break instruction to exit loops. However, the
1059 * hardware will still use this register to determine when to exit a
1060 * loop, so we need to initialize the counter to 0, set the increment
1061 * value to 1 and the maximum counter value to 4095 (0xfff), which
1062 * is the maximum value allowed. This gives us a maximum of 4096
1063 * iterations for our loops, but hopefully our break instruction will
1064 * execute some time before the 4096th iteration.
1065 */
1066 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1067 }
1068
1069 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1070 {
1071 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1072 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1073 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1074 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1075 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1076 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1077 rctx->b.b.launch_grid = evergreen_launch_grid;
1078
1079 }
1080
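/**
 * Map a global buffer. An item that lives in the compute memory pool is
 * first demoted to its own buffer (items not yet in the pool get a
 * detached buffer allocated on demand); the mapping is then done on that
 * real buffer rather than on the r600_resource_global wrapper.
 */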
1081 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1082 struct pipe_resource *resource,
1083 unsigned level,
1084 unsigned usage,
1085 const struct pipe_box *box,
1086 struct pipe_transfer **ptransfer)
1087 {
1088 struct r600_context *rctx = (struct r600_context*)ctx;
1089 struct compute_memory_pool *pool = rctx->screen->global_pool;
1090 struct r600_resource_global* buffer =
1091 (struct r600_resource_global*)resource;
1092
1093 struct compute_memory_item *item = buffer->chunk;
1094 struct pipe_resource *dst = NULL;
1095 unsigned offset = box->x;
1096
1097 if (is_item_in_pool(item)) {
1098 compute_memory_demote_item(pool, item, ctx);
1099 }
1100 else {
1101 if (item->real_buffer == NULL) {
1102 item->real_buffer =
1103 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1104 }
1105 }
1106
1107 dst = (struct pipe_resource*)item->real_buffer;
1108
1109 if (usage & PIPE_TRANSFER_READ)
1110 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1111
1112 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1113 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1114 "width = %u, height = %u, depth = %u)\n", level, usage,
1115 box->x, box->y, box->z, box->width, box->height,
1116 box->depth);
1117 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1118 "%u (box.x)\n", item->id, box->x);
1119
1120
1121 assert(resource->target == PIPE_BUFFER);
1122 assert(resource->bind & PIPE_BIND_GLOBAL);
1123 assert(box->x >= 0);
1124 assert(box->y == 0);
1125 assert(box->z == 0);
1126
1127 ///TODO: do it better, mapping is not possible if the pool is too big
1128 return pipe_buffer_map_range(ctx, dst,
1129 offset, box->width, usage, ptransfer);
1130 }
1131
1132 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1133 struct pipe_transfer *transfer)
1134 {
1135 /* struct r600_resource_global are not real resources, they just map
1136 * to an offset within the compute memory pool. The function
1137 * r600_compute_global_transfer_map() maps the memory pool
1138 * resource rather than the struct r600_resource_global passed to
1139 * it as an argument and then initializes ptransfer->resource with
1140 * the memory pool resource (via pipe_buffer_map_range).
1141 * When transfer_unmap is called it uses the memory pool's
1142 * vtable, which calls r600_buffer_transfer_unmap() rather than
1143 * this function.
1144 */
1145 assert (!"This function should not be called");
1146 }
1147
1148 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1149 struct pipe_transfer *transfer,
1150 const struct pipe_box *box)
1151 {
1152 assert(0 && "TODO");
1153 }
1154
1155 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1156 struct pipe_resource *res)
1157 {
1158 struct r600_resource_global* buffer = NULL;
1159 struct r600_screen* rscreen = NULL;
1160
1161 assert(res->target == PIPE_BUFFER);
1162 assert(res->bind & PIPE_BIND_GLOBAL);
1163
1164 buffer = (struct r600_resource_global*)res;
1165 rscreen = (struct r600_screen*)screen;
1166
1167 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1168
1169 buffer->chunk = NULL;
1170 free(res);
1171 }
1172
1173 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1174 {
1175 u_default_resource_get_handle, /* get_handle */
1176 r600_compute_global_buffer_destroy, /* resource_destroy */
1177 r600_compute_global_transfer_map, /* transfer_map */
1178 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1179 r600_compute_global_transfer_unmap, /* transfer_unmap */
1180 };
1181
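/**
 * Create a PIPE_BIND_GLOBAL buffer. The returned resource is a thin
 * wrapper around a compute_memory_pool item; the backing memory is
 * managed by the pool and sized in dwords.
 */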
1182 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1183 const struct pipe_resource *templ)
1184 {
1185 struct r600_resource_global* result = NULL;
1186 struct r600_screen* rscreen = NULL;
1187 int size_in_dw = 0;
1188
1189 assert(templ->target == PIPE_BUFFER);
1190 assert(templ->bind & PIPE_BIND_GLOBAL);
1191 assert(templ->array_size == 1 || templ->array_size == 0);
1192 assert(templ->depth0 == 1 || templ->depth0 == 0);
1193 assert(templ->height0 == 1 || templ->height0 == 0);
1194
1195 result = (struct r600_resource_global*)
1196 CALLOC(sizeof(struct r600_resource_global), 1);
1197 rscreen = (struct r600_screen*)screen;
1198
1199 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1200 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1201 templ->array_size);
1202
1203 result->base.b.vtbl = &r600_global_buffer_vtbl;
1204 result->base.b.b = *templ;
1205 result->base.b.b.screen = screen;
1206 pipe_reference_init(&result->base.b.b.reference, 1);
1207
1208 size_in_dw = (templ->width0+3) / 4;
1209
1210 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1211
1212 if (result->chunk == NULL)
1213 {
1214 free(result);
1215 return NULL;
1216 }
1217
1218 return &result->base.b.b;
1219 }