ac: change legacy_surf_level::slice_size to dword units
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "pipebuffer/pb_buffer.h"
45 #include "evergreend.h"
46 #include "r600_shader.h"
47 #include "r600_pipe.h"
48 #include "r600_formats.h"
49 #include "evergreen_compute.h"
50 #include "evergreen_compute_internal.h"
51 #include "compute_memory_pool.h"
52 #include "sb/sb_public.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant indexing;
67 it is also cached in the constant cache
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing,
72 because we reserve RAT0 for global bindings. With byte addressing enabled
73 we should reserve another one too => at most 10 image bindings for writing.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images will consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
85
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 0, PIPE_USAGE_IMMUTABLE, size);
94
95 return (struct r600_resource *)buffer;
96 }
97
98
99 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
100 unsigned id,
101 struct r600_resource *bo,
102 int start,
103 int size)
104 {
105 struct pipe_surface rat_templ;
106 struct r600_surface *surf = NULL;
107 struct r600_context *rctx = NULL;
108
109 assert(id < 12);
110 assert((size & 3) == 0);
111 assert((start & 0xFF) == 0);
112
113 rctx = pipe->ctx;
114
115 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
116
117 /* Create the RAT surface */
118 memset(&rat_templ, 0, sizeof(rat_templ));
119 rat_templ.format = PIPE_FORMAT_R32_UINT;
120 rat_templ.u.tex.level = 0;
121 rat_templ.u.tex.first_layer = 0;
122 rat_templ.u.tex.last_layer = 0;
123
124 /* Add the RAT to the list of color buffers */
125 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
126 (struct pipe_context *)pipe->ctx,
127 (struct pipe_resource *)bo, &rat_templ);
128
129 /* Update the number of color buffers */
130 pipe->ctx->framebuffer.state.nr_cbufs =
131 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
132
133 /* Update the cb_target_mask
134 * XXX: I think this is a potential spot for bugs once we start doing
135 * GL interop. cb_target_mask may be modified in the 3D sections
136 * of this driver. */
137 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
138
139 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
140 evergreen_init_color_surface_rat(rctx, surf);
141 }
142
143 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
144 unsigned vb_index,
145 unsigned offset,
146 struct pipe_resource *buffer)
147 {
148 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
149 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
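	/* Compute/global buffers are bound as raw vertex data with a 1-byte
	 * stride, so offsets into them can be expressed in bytes. */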
150 vb->stride = 1;
151 vb->buffer_offset = offset;
152 vb->buffer.resource = buffer;
153 vb->is_user_buffer = false;
154
155 /* The vertex instructions in the compute shaders use the texture cache,
156 * so we need to invalidate it. */
157 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
158 state->enabled_mask |= 1 << vb_index;
159 state->dirty_mask |= 1 << vb_index;
160 r600_mark_atom_dirty(rctx, &state->atom);
161 }
162
163 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
164 unsigned cb_index,
165 unsigned offset,
166 unsigned size,
167 struct pipe_resource *buffer)
168 {
169 struct pipe_constant_buffer cb;
170 cb.buffer_size = size;
171 cb.buffer_offset = offset;
172 cb.buffer = buffer;
173 cb.user_buffer = NULL;
174
175 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
176 }
177
178 /* We need to define these R600 registers here, because we can't include
179 * both evergreend.h and r600d.h at the same time.
180 */
181 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
182 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
183
184 #ifdef HAVE_OPENCL
185 static void parse_symbol_table(Elf_Data *symbol_table_data,
186 const GElf_Shdr *symbol_table_header,
187 struct ac_shader_binary *binary)
188 {
189 GElf_Sym symbol;
190 unsigned i = 0;
191 unsigned symbol_count =
192 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
193
194 /* We are over allocating this list, because symbol_count gives the
195 * total number of symbols, and we will only be filling the list
196 * with offsets of global symbols. The memory savings from
197 * allocating the correct size of this list will be small, and
198 * I don't think it is worth the cost of pre-computing the number
199 * of global symbols.
200 */
201 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
202
203 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
204 unsigned i;
205 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
206 symbol.st_shndx == 0 /* Undefined symbol */) {
207 continue;
208 }
209
210 binary->global_symbol_offsets[binary->global_symbol_count] =
211 symbol.st_value;
212
213 /* Insert the new offset at its sorted position (a simple
214 * insertion sort). This list will usually be small. */
215 for (i = binary->global_symbol_count; i > 0; --i) {
216 uint64_t lhs = binary->global_symbol_offsets[i - 1];
217 uint64_t rhs = binary->global_symbol_offsets[i];
218 if (lhs < rhs) {
219 break;
220 }
221 binary->global_symbol_offsets[i] = lhs;
222 binary->global_symbol_offsets[i - 1] = rhs;
223 }
224 ++binary->global_symbol_count;
225 }
226 }
227
228
229 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
230 unsigned symbol_sh_link,
231 struct ac_shader_binary *binary)
232 {
233 unsigned i;
234
235 if (!relocs || !symbols || !binary->reloc_count) {
236 return;
237 }
238 binary->relocs = CALLOC(binary->reloc_count,
239 sizeof(struct ac_shader_reloc));
240 for (i = 0; i < binary->reloc_count; i++) {
241 GElf_Sym symbol;
242 GElf_Rel rel;
243 char *symbol_name;
244 struct ac_shader_reloc *reloc = &binary->relocs[i];
245
246 gelf_getrel(relocs, i, &rel);
247 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
248 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
249
250 reloc->offset = rel.r_offset;
251 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
252 reloc->name[sizeof(reloc->name)-1] = 0;
253 }
254 }
255
256 static void r600_elf_read(const char *elf_data, unsigned elf_size,
257 struct ac_shader_binary *binary)
258 {
259 char *elf_buffer;
260 Elf *elf;
261 Elf_Scn *section = NULL;
262 Elf_Data *symbols = NULL, *relocs = NULL;
263 size_t section_str_index;
264 unsigned symbol_sh_link = 0;
265
266 /* One of the libelf implementations
267 * (http://www.mr511.de/software/english.htm) requires calling
268 * elf_version() before elf_memory().
269 */
270 elf_version(EV_CURRENT);
271 elf_buffer = MALLOC(elf_size);
272 memcpy(elf_buffer, elf_data, elf_size);
273
274 elf = elf_memory(elf_buffer, elf_size);
275
276 elf_getshdrstrndx(elf, &section_str_index);
277
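	/* Walk the ELF sections and pull out the pieces we need: the shader
	 * code (.text), the compiler's register config (.AMDGPU.config),
	 * optional disassembly, read-only data, the symbol table and the
	 * .text relocations. */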
278 while ((section = elf_nextscn(elf, section))) {
279 const char *name;
280 Elf_Data *section_data = NULL;
281 GElf_Shdr section_header;
282 if (gelf_getshdr(section, &section_header) != &section_header) {
283 fprintf(stderr, "Failed to read ELF section header\n");
284 return;
285 }
286 name = elf_strptr(elf, section_str_index, section_header.sh_name);
287 if (!strcmp(name, ".text")) {
288 section_data = elf_getdata(section, section_data);
289 binary->code_size = section_data->d_size;
290 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
291 memcpy(binary->code, section_data->d_buf, binary->code_size);
292 } else if (!strcmp(name, ".AMDGPU.config")) {
293 section_data = elf_getdata(section, section_data);
294 binary->config_size = section_data->d_size;
295 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
296 memcpy(binary->config, section_data->d_buf, binary->config_size);
297 } else if (!strcmp(name, ".AMDGPU.disasm")) {
298 /* Always read disassembly if it's available. */
299 section_data = elf_getdata(section, section_data);
300 binary->disasm_string = strndup(section_data->d_buf,
301 section_data->d_size);
302 } else if (!strncmp(name, ".rodata", 7)) {
303 section_data = elf_getdata(section, section_data);
304 binary->rodata_size = section_data->d_size;
305 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
306 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
307 } else if (!strncmp(name, ".symtab", 7)) {
308 symbols = elf_getdata(section, section_data);
309 symbol_sh_link = section_header.sh_link;
310 parse_symbol_table(symbols, &section_header, binary);
311 } else if (!strcmp(name, ".rel.text")) {
312 relocs = elf_getdata(section, section_data);
313 binary->reloc_count = section_header.sh_size /
314 section_header.sh_entsize;
315 }
316 }
317
318 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
319
320 if (elf){
321 elf_end(elf);
322 }
323 FREE(elf_buffer);
324
325 /* Cache the config size per symbol */
326 if (binary->global_symbol_count) {
327 binary->config_size_per_symbol =
328 binary->config_size / binary->global_symbol_count;
329 } else {
330 binary->global_symbol_count = 1;
331 binary->config_size_per_symbol = binary->config_size;
332 }
333 }
334
335 static const unsigned char *r600_shader_binary_config_start(
336 const struct ac_shader_binary *binary,
337 uint64_t symbol_offset)
338 {
339 unsigned i;
340 for (i = 0; i < binary->global_symbol_count; ++i) {
341 if (binary->global_symbol_offsets[i] == symbol_offset) {
342 unsigned offset = i * binary->config_size_per_symbol;
343 return binary->config + offset;
344 }
345 }
346 return binary->config;
347 }
348
349 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
350 struct r600_bytecode *bc,
351 uint64_t symbol_offset,
352 boolean *use_kill)
353 {
354 unsigned i;
355 const unsigned char *config =
356 r600_shader_binary_config_start(binary, symbol_offset);
357
358 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
359 unsigned reg =
360 util_le32_to_cpu(*(uint32_t*)(config + i));
361 unsigned value =
362 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
363 switch (reg) {
364 /* R600 / R700 */
365 case R_028850_SQ_PGM_RESOURCES_PS:
366 case R_028868_SQ_PGM_RESOURCES_VS:
367 /* Evergreen / Northern Islands */
368 case R_028844_SQ_PGM_RESOURCES_PS:
369 case R_028860_SQ_PGM_RESOURCES_VS:
370 case R_0288D4_SQ_PGM_RESOURCES_LS:
371 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
372 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
373 break;
374 case R_02880C_DB_SHADER_CONTROL:
375 *use_kill = G_02880C_KILL_ENABLE(value);
376 break;
377 case R_0288E8_SQ_LDS_ALLOC:
378 bc->nlds_dw = value;
379 break;
380 }
381 }
382 }
383
384 static unsigned r600_create_shader(struct r600_bytecode *bc,
385 const struct ac_shader_binary *binary,
386 boolean *use_kill)
387
388 {
389 assert(binary->code_size % 4 == 0);
390 bc->bytecode = CALLOC(1, binary->code_size);
391 memcpy(bc->bytecode, binary->code, binary->code_size);
392 bc->ndw = binary->code_size / 4;
393
394 r600_shader_binary_read_config(binary, bc, 0, use_kill);
395 return 0;
396 }
397
398 #endif
399
400 static void r600_destroy_shader(struct r600_bytecode *bc)
401 {
402 FREE(bc->bytecode);
403 }
404
405 static void *evergreen_create_compute_state(struct pipe_context *ctx,
406 const struct pipe_compute_state *cso)
407 {
408 struct r600_context *rctx = (struct r600_context *)ctx;
409 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
410 #ifdef HAVE_OPENCL
411 const struct pipe_llvm_program_header *header;
412 const char *code;
413 void *p;
414 boolean use_kill;
415
416 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
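	/* cso->prog is a pipe_llvm_program_header followed by an ELF image of
	 * header->num_bytes bytes. */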
417 header = cso->prog;
418 code = cso->prog + sizeof(struct pipe_llvm_program_header);
419 radeon_shader_binary_init(&shader->binary);
420 r600_elf_read(code, header->num_bytes, &shader->binary);
421 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
422
423 /* Upload code + ROdata */
424 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
425 shader->bc.ndw * 4);
426 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
427 //TODO: use util_memcpy_cpu_to_le32 ?
428 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
429 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
430 #endif
431
432 shader->ctx = rctx;
433 shader->local_size = cso->req_local_mem;
434 shader->private_size = cso->req_private_mem;
435 shader->input_size = cso->req_input_mem;
436
437 return shader;
438 }
439
440 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
441 {
442 struct r600_context *rctx = (struct r600_context *)ctx;
443 struct r600_pipe_compute *shader = state;
444
445 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
446
447 if (!shader)
448 return;
449
450 #ifdef HAVE_OPENCL
451 radeon_shader_binary_clean(&shader->binary);
452 #endif
453 r600_destroy_shader(&shader->bc);
454
455 /* TODO destroy shader->code_bo, shader->const_bo
456 * we'll need something like r600_buffer_free */
457 FREE(shader);
458 }
459
460 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
461 {
462 struct r600_context *rctx = (struct r600_context *)ctx;
463
464 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
465
466 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
467 }
468
469 /* The kernel parameters are stored in a vtx buffer (ID=0); besides the explicit
470 * kernel parameters there are implicit parameters that need to be stored
471 * in the vertex buffer as well. Here is how these parameters are organized in
472 * the buffer:
473 *
474 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
475 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
476 * DWORDS 6-8: Number of work items within each work group in each dimension
477 * (x,y,z)
478 * DWORDS 9+ : Kernel parameters
479 */
480 static void evergreen_compute_upload_input(struct pipe_context *ctx,
481 const struct pipe_grid_info *info)
482 {
483 struct r600_context *rctx = (struct r600_context *)ctx;
484 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
485 unsigned i;
486 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
487 * parameters.
488 */
489 unsigned input_size = shader->input_size + 36;
490 uint32_t *num_work_groups_start;
491 uint32_t *global_size_start;
492 uint32_t *local_size_start;
493 uint32_t *kernel_parameters_start;
494 struct pipe_box box;
495 struct pipe_transfer *transfer = NULL;
496
497 if (shader->input_size == 0) {
498 return;
499 }
500
501 if (!shader->kernel_param) {
502 /* Add space for the grid dimensions */
503 shader->kernel_param = (struct r600_resource *)
504 pipe_buffer_create(ctx->screen, 0,
505 PIPE_USAGE_IMMUTABLE, input_size);
506 }
507
508 u_box_1d(0, input_size, &box);
509 num_work_groups_start = ctx->transfer_map(ctx,
510 (struct pipe_resource*)shader->kernel_param,
511 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
512 &box, &transfer);
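	/* Lay out the implicit parameters as described above: three dwords
	 * each for the work group count, the global size and the local size,
	 * followed by the kernel arguments. */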
513 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
514 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
515 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
516
517 /* Copy the work group size */
518 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
519
520 /* Copy the global size */
521 for (i = 0; i < 3; i++) {
522 global_size_start[i] = info->grid[i] * info->block[i];
523 }
524
525 /* Copy the local dimensions */
526 memcpy(local_size_start, info->block, 3 * sizeof(uint));
527
528 /* Copy the kernel inputs */
529 memcpy(kernel_parameters_start, info->input, shader->input_size);
530
531 for (i = 0; i < (input_size / 4); i++) {
532 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
533 ((unsigned*)num_work_groups_start)[i]);
534 }
535
536 ctx->transfer_unmap(ctx, transfer);
537
538 /* ID=0 and ID=3 are reserved for the parameters.
539 * LLVM will preferably use ID=0, but it does not work for dynamic
540 * indices. */
541 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
542 (struct pipe_resource*)shader->kernel_param);
543 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
544 (struct pipe_resource*)shader->kernel_param);
545 }
546
547 static void evergreen_emit_dispatch(struct r600_context *rctx,
548 const struct pipe_grid_info *info)
549 {
550 int i;
551 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
552 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
553 unsigned num_waves;
554 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
555 unsigned wave_divisor = (16 * num_pipes);
556 int group_size = 1;
557 int grid_size = 1;
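	/* LDS is allocated in dwords: the kernel's declared local memory plus
	 * whatever the compiler reserved (nlds_dw). */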
558 unsigned lds_size = shader->local_size / 4 +
559 shader->bc.nlds_dw;
560
561
562 /* Calculate group_size/grid_size */
563 for (i = 0; i < 3; i++) {
564 group_size *= info->block[i];
565 }
566
567 for (i = 0; i < 3; i++) {
568 grid_size *= info->grid[i];
569 }
570
571 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
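	/* e.g. a 256-thread block on a chip with two quad pipes needs
	 * ceil(256 / 32) = 8 wavefronts. */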
572 num_waves = (info->block[0] * info->block[1] * info->block[2] +
573 wave_divisor - 1) / wave_divisor;
574
575 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
576 "%u wavefronts per thread block, "
577 "allocating %u dwords lds.\n",
578 num_pipes, num_waves, lds_size);
579
580 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
581
582 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
583 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
584 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
585 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
586
587 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
588 group_size);
589
590 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
591 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
592 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
593 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
594
595 if (rctx->b.chip_class < CAYMAN) {
596 assert(lds_size <= 8192);
597 } else {
598 /* Cayman appears to have a slightly smaller limit, see the
599 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
600 assert(lds_size <= 8160);
601 }
602
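	/* SQ_LDS_ALLOC packs the LDS size in dwords into the low bits and the
	 * number of waves per thread group at bit 14. */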
603 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
604 lds_size | (num_waves << 14));
605
606 /* Dispatch packet */
607 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
608 radeon_emit(cs, info->grid[0]);
609 radeon_emit(cs, info->grid[1]);
610 radeon_emit(cs, info->grid[2]);
611 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
612 radeon_emit(cs, 1);
613
614 if (rctx->is_debug)
615 eg_trace_emit(rctx);
616 }
617
618 static void compute_emit_cs(struct r600_context *rctx,
619 const struct pipe_grid_info *info)
620 {
621 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
622 unsigned i;
623
624 /* make sure that the gfx ring is the only one active */
625 if (radeon_emitted(rctx->b.dma.cs, 0)) {
626 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
627 }
628
629 /* Initialize all the compute-related registers.
630 *
631 * See evergreen_init_atom_start_compute_cs() in this file for the list
632 * of registers initialized by the start_compute_cs_cmd atom.
633 */
634 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
635
636 /* emit config state */
637 if (rctx->b.chip_class == EVERGREEN)
638 r600_emit_atom(rctx, &rctx->config_state.atom);
639
640 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
641 r600_flush_emit(rctx);
642
643 /* Emit colorbuffers. */
644 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
645 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
646 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
647 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
648 (struct r600_resource*)cb->base.texture,
649 RADEON_USAGE_READWRITE,
650 RADEON_PRIO_SHADER_RW_BUFFER);
651
652 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
653 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
654 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
655 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
656 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
657 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
658 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
659 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
660
661 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
662 radeon_emit(cs, reloc);
663
664 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
665 radeon_emit(cs, reloc);
666 }
667 for (; i < 8 ; i++)
668 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
669 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
670 for (; i < 12; i++)
671 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
672 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
673
674 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
675 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
676 rctx->compute_cb_target_mask);
677
678
679 /* Emit vertex buffer state */
680 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
681 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
682
683 /* Emit constant buffer state */
684 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
685
686 /* Emit sampler state */
687 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
688
689 /* Emit sampler view (texture resource) state */
690 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
691
692 /* Emit compute shader state */
693 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
694
695 /* Emit dispatch state and dispatch packet */
696 evergreen_emit_dispatch(rctx, info);
697
698 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
699 */
700 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
701 R600_CONTEXT_INV_VERTEX_CACHE |
702 R600_CONTEXT_INV_TEX_CACHE;
703 r600_flush_emit(rctx);
704 rctx->b.flags = 0;
705
706 if (rctx->b.chip_class >= CAYMAN) {
707 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
708 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
709 /* DEALLOC_STATE prevents the GPU from hanging when a
710 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
711 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
712 */
713 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
714 radeon_emit(cs, 0);
715 }
716
717 #if 0
718 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
719 for (i = 0; i < cs->cdw; i++) {
720 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
721 }
722 #endif
723
724 }
725
726
727 /**
728 * Emit function for r600_cs_shader_state atom
729 */
730 void evergreen_emit_cs_shader(struct r600_context *rctx,
731 struct r600_atom *atom)
732 {
733 struct r600_cs_shader_state *state =
734 (struct r600_cs_shader_state*)atom;
735 struct r600_pipe_compute *shader = state->shader;
736 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
737 uint64_t va;
738 struct r600_resource *code_bo;
739 unsigned ngpr, nstack;
740
741 code_bo = shader->code_bo;
742 va = shader->code_bo->gpu_address + state->pc;
743 ngpr = shader->bc.ngpr;
744 nstack = shader->bc.nstack;
745
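	/* Compute shaders run on the LS stage; SQ_PGM_START_LS takes the
	 * 256-byte-aligned code address, hence the >> 8. */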
746 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
747 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
748 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
749 S_0288D4_NUM_GPRS(ngpr) |
750 S_0288D4_DX10_CLAMP(1) |
751 S_0288D4_STACK_SIZE(nstack));
752 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
753
754 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
755 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
756 code_bo, RADEON_USAGE_READ,
757 RADEON_PRIO_SHADER_BINARY));
758 }
759
760 static void evergreen_launch_grid(struct pipe_context *ctx,
761 const struct pipe_grid_info *info)
762 {
763 struct r600_context *rctx = (struct r600_context *)ctx;
764 #ifdef HAVE_OPENCL
765 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
766 boolean use_kill;
767
768 rctx->cs_shader_state.pc = info->pc;
769 /* Get the config information for this kernel. */
770 r600_shader_binary_read_config(&shader->binary, &shader->bc,
771 info->pc, &use_kill);
772 #endif
773
774 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
775
776
777 evergreen_compute_upload_input(ctx, info);
778 compute_emit_cs(rctx, info);
779 }
780
781 static void evergreen_set_compute_resources(struct pipe_context *ctx,
782 unsigned start, unsigned count,
783 struct pipe_surface **surfaces)
784 {
785 struct r600_context *rctx = (struct r600_context *)ctx;
786 struct r600_surface **resources = (struct r600_surface **)surfaces;
787
788 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
789 start, count);
790
791 for (unsigned i = 0; i < count; i++) {
792 /* The first four vertex buffers are reserved for parameters and
793 * global buffers. */
794 unsigned vtx_id = 4 + i;
795 if (resources[i]) {
796 struct r600_resource_global *buffer =
797 (struct r600_resource_global*)
798 resources[i]->base.texture;
799 if (resources[i]->base.writable) {
800 assert(i+1 < 12);
801
802 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
803 (struct r600_resource *)resources[i]->base.texture,
804 buffer->chunk->start_in_dw*4,
805 resources[i]->base.texture->width0);
806 }
807
808 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
809 buffer->chunk->start_in_dw * 4,
810 resources[i]->base.texture);
811 }
812 }
813 }
814
815 static void evergreen_set_global_binding(struct pipe_context *ctx,
816 unsigned first, unsigned n,
817 struct pipe_resource **resources,
818 uint32_t **handles)
819 {
820 struct r600_context *rctx = (struct r600_context *)ctx;
821 struct compute_memory_pool *pool = rctx->screen->global_pool;
822 struct r600_resource_global **buffers =
823 (struct r600_resource_global **)resources;
824 unsigned i;
825
826 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
827 first, n);
828
829 if (!resources) {
830 /* XXX: Unset */
831 return;
832 }
833
834 /* We mark these items for promotion to the pool if they
835 * aren't already there */
836 for (i = first; i < first + n; i++) {
837 struct compute_memory_item *item = buffers[i]->chunk;
838
839 if (!is_item_in_pool(item))
840 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
841 }
842
843 if (compute_memory_finalize_pending(pool, ctx) == -1) {
844 /* XXX: Unset */
845 return;
846 }
847
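	/* Patch each handle to hold the byte offset of the buffer within the
	 * memory pool (the chunk start is stored in dwords). */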
848 for (i = first; i < first + n; i++)
849 {
850 uint32_t buffer_offset;
851 uint32_t handle;
852 assert(resources[i]->target == PIPE_BUFFER);
853 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
854
855 buffer_offset = util_le32_to_cpu(*(handles[i]));
856 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
857
858 *(handles[i]) = util_cpu_to_le32(handle);
859 }
860
861 /* globals for writing */
862 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
863 /* globals for reading */
864 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
865 (struct pipe_resource*)pool->bo);
866
867 /* constants for reading; LLVM puts them in the text segment */
868 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
869 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
870 }
871
872 /**
873 * This function initializes all the compute specific registers that need to
874 * be initialized for each compute command stream. Registers that are common
875 * to both compute and 3D will be initialized at the beginning of each compute
876 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
877 * packet requires that the shader type bit be set, we must initialize all
878 * context registers needed for compute in this function. The registers
879 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
880 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
881 * on the GPU family.
882 */
883 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
884 {
885 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
886 int num_threads;
887 int num_stack_entries;
888
889 /* since all required registers are initialized in the
890 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
891 */
892 r600_init_command_buffer(cb, 256);
893 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
894
895 /* This must be first. */
896 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
897 r600_store_value(cb, 0x80000000);
898 r600_store_value(cb, 0x80000000);
899
900 /* We're setting config registers here. */
901 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
902 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
903
904 switch (rctx->b.family) {
905 case CHIP_CEDAR:
906 default:
907 num_threads = 128;
908 num_stack_entries = 256;
909 break;
910 case CHIP_REDWOOD:
911 num_threads = 128;
912 num_stack_entries = 256;
913 break;
914 case CHIP_JUNIPER:
915 num_threads = 128;
916 num_stack_entries = 512;
917 break;
918 case CHIP_CYPRESS:
919 case CHIP_HEMLOCK:
920 num_threads = 128;
921 num_stack_entries = 512;
922 break;
923 case CHIP_PALM:
924 num_threads = 128;
925 num_stack_entries = 256;
926 break;
927 case CHIP_SUMO:
928 num_threads = 128;
929 num_stack_entries = 256;
930 break;
931 case CHIP_SUMO2:
932 num_threads = 128;
933 num_stack_entries = 512;
934 break;
935 case CHIP_BARTS:
936 num_threads = 128;
937 num_stack_entries = 512;
938 break;
939 case CHIP_TURKS:
940 num_threads = 128;
941 num_stack_entries = 256;
942 break;
943 case CHIP_CAICOS:
944 num_threads = 128;
945 num_stack_entries = 256;
946 break;
947 }
948
949 /* Config Registers */
950 if (rctx->b.chip_class < CAYMAN)
951 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
952 rctx->screen->b.info.drm_minor);
953 else
954 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
955 rctx->screen->b.info.drm_minor);
956
957 /* The primitive type always needs to be POINTLIST for compute. */
958 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
959 V_008958_DI_PT_POINTLIST);
960
961 if (rctx->b.chip_class < CAYMAN) {
962
963 /* These registers control which simds can be used by each stage.
964 * The default for these registers is 0xffffffff, which means
965 * all simds are available for each stage. It's possible we may
966 * want to play around with these in the future, but for now
967 * the default value is fine.
968 *
969 * R_008E20_SQ_STATIC_THREAD_MGMT1
970 * R_008E24_SQ_STATIC_THREAD_MGMT2
971 * R_008E28_SQ_STATIC_THREAD_MGMT3
972 */
973
974 /* XXX: We may need to adjust the thread and stack resource
975 * values for 3D/compute interop */
976
977 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
978
979 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
980 * Set the number of threads used by the PS/VS/GS/ES stage to
981 * 0.
982 */
983 r600_store_value(cb, 0);
984
985 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
986 * Set the number of threads used by the CS (aka LS) stage to
987 * the maximum number of threads and set the number of threads
988 * for the HS stage to 0. */
989 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
990
991 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
992 * Set the Control Flow stack entries to 0 for PS/VS stages */
993 r600_store_value(cb, 0);
994
995 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
996 * Set the Control Flow stack entries to 0 for GS/ES stages */
997 r600_store_value(cb, 0);
998
999 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1000 * Set the Control Flow stack entries to 0 for the HS stage, and
1001 * set it to the maximum value for the CS (aka LS) stage. */
1002 r600_store_value(cb,
1003 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1004 }
1005 /* Give the compute shader all the available LDS space.
1006 * NOTE: This only sets the maximum number of dwords that a compute
1007 * shader can allocate. When a shader is executed, we still need to
1008 * allocate the appropriate amount of LDS dwords using the
1009 * CM_R_0288E8_SQ_LDS_ALLOC register.
1010 */
1011 if (rctx->b.chip_class < CAYMAN) {
1012 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1013 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1014 } else {
1015 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1016 S_0286FC_NUM_PS_LDS(0) |
1017 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1018 }
1019
1020 /* Context Registers */
1021
1022 if (rctx->b.chip_class < CAYMAN) {
1023 /* workaround for hw issues with dyn gpr - must set all limits
1024 * to 240 instead of 0, 0x1e == 240 / 8
1025 */
1026 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1027 S_028838_PS_GPRS(0x1e) |
1028 S_028838_VS_GPRS(0x1e) |
1029 S_028838_GS_GPRS(0x1e) |
1030 S_028838_ES_GPRS(0x1e) |
1031 S_028838_HS_GPRS(0x1e) |
1032 S_028838_LS_GPRS(0x1e));
1033 }
1034
1035 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1036 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1037 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1038
1039 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1040
1041 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1042 S_0286E8_TID_IN_GROUP_ENA(1) |
1043 S_0286E8_TGID_ENA(1) |
1044 S_0286E8_DISABLE_INDEX_PACK(1));
1045
1046 /* The LOOP_CONST registers are an optimization for loops that allows
1047 * you to store the initial counter, increment value, and maximum
1048 * counter value in a register so that hardware can calculate the
1049 * correct number of iterations for the loop, so that you don't need
1050 * to have the loop counter in your shader code. We don't currently use
1051 * this optimization, so we must keep track of the counter in the
1052 * shader and use a break instruction to exit loops. However, the
1053 * hardware will still use this register to determine when to exit a
1054 * loop, so we need to initialize the counter to 0, set the increment
1055 * value to 1 and the maximum counter value to 4095 (0xfff), which
1056 * is the maximum value allowed. This gives us a maximum of 4096
1057 * iterations for our loops, but hopefully our break instruction will
1058 * execute some time before the 4096th iteration.
1059 */
1060 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1061 }
1062
1063 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1064 {
1065 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1066 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1067 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1068 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1069 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1070 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1071 rctx->b.b.launch_grid = evergreen_launch_grid;
1072
1073 }
1074
1075 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1076 struct pipe_resource *resource,
1077 unsigned level,
1078 unsigned usage,
1079 const struct pipe_box *box,
1080 struct pipe_transfer **ptransfer)
1081 {
1082 struct r600_context *rctx = (struct r600_context*)ctx;
1083 struct compute_memory_pool *pool = rctx->screen->global_pool;
1084 struct r600_resource_global* buffer =
1085 (struct r600_resource_global*)resource;
1086
1087 struct compute_memory_item *item = buffer->chunk;
1088 struct pipe_resource *dst = NULL;
1089 unsigned offset = box->x;
1090
1091 if (is_item_in_pool(item)) {
1092 compute_memory_demote_item(pool, item, ctx);
1093 }
1094 else {
1095 if (item->real_buffer == NULL) {
1096 item->real_buffer =
1097 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1098 }
1099 }
1100
1101 dst = (struct pipe_resource*)item->real_buffer;
1102
1103 if (usage & PIPE_TRANSFER_READ)
1104 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1105
1106 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1107 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1108 "width = %u, height = %u, depth = %u)\n", level, usage,
1109 box->x, box->y, box->z, box->width, box->height,
1110 box->depth);
1111 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1112 "%u (box.x)\n", item->id, box->x);
1113
1114
1115 assert(resource->target == PIPE_BUFFER);
1116 assert(resource->bind & PIPE_BIND_GLOBAL);
1117 assert(box->x >= 0);
1118 assert(box->y == 0);
1119 assert(box->z == 0);
1120
1121 ///TODO: do it better, mapping is not possible if the pool is too big
1122 return pipe_buffer_map_range(ctx, dst,
1123 offset, box->width, usage, ptransfer);
1124 }
1125
1126 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1127 struct pipe_transfer *transfer)
1128 {
1129 /* struct r600_resource_global are not real resources, they just map
1130 * to an offset within the compute memory pool. The function
1131 * r600_compute_global_transfer_map() maps the memory pool
1132 * resource rather than the struct r600_resource_global passed to
1133 * it as an argument and then initializes ptransfer->resource with
1134 * the memory pool resource (via pipe_buffer_map_range).
1135 * When transfer_unmap is called it uses the memory pool's
1136 * vtable which calls r600_buffer_transfer_unmap() rather than
1137 * this function.
1138 */
1139 assert (!"This function should not be called");
1140 }
1141
1142 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1143 struct pipe_transfer *transfer,
1144 const struct pipe_box *box)
1145 {
1146 assert(0 && "TODO");
1147 }
1148
1149 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1150 struct pipe_resource *res)
1151 {
1152 struct r600_resource_global* buffer = NULL;
1153 struct r600_screen* rscreen = NULL;
1154
1155 assert(res->target == PIPE_BUFFER);
1156 assert(res->bind & PIPE_BIND_GLOBAL);
1157
1158 buffer = (struct r600_resource_global*)res;
1159 rscreen = (struct r600_screen*)screen;
1160
1161 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1162
1163 buffer->chunk = NULL;
1164 free(res);
1165 }
1166
1167 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1168 {
1169 u_default_resource_get_handle, /* get_handle */
1170 r600_compute_global_buffer_destroy, /* resource_destroy */
1171 r600_compute_global_transfer_map, /* transfer_map */
1172 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1173 r600_compute_global_transfer_unmap, /* transfer_unmap */
1174 };
1175
1176 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1177 const struct pipe_resource *templ)
1178 {
1179 struct r600_resource_global* result = NULL;
1180 struct r600_screen* rscreen = NULL;
1181 int size_in_dw = 0;
1182
1183 assert(templ->target == PIPE_BUFFER);
1184 assert(templ->bind & PIPE_BIND_GLOBAL);
1185 assert(templ->array_size == 1 || templ->array_size == 0);
1186 assert(templ->depth0 == 1 || templ->depth0 == 0);
1187 assert(templ->height0 == 1 || templ->height0 == 0);
1188
1189 result = (struct r600_resource_global*)
1190 CALLOC(sizeof(struct r600_resource_global), 1);
1191 rscreen = (struct r600_screen*)screen;
1192
1193 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1194 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1195 templ->array_size);
1196
1197 result->base.b.vtbl = &r600_global_buffer_vtbl;
1198 result->base.b.b = *templ;
1199 result->base.b.b.screen = screen;
1200 pipe_reference_init(&result->base.b.b.reference, 1);
1201
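	/* The global memory pool allocates in dwords, so round the requested
	 * size up. */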
1202 size_in_dw = (templ->width0+3) / 4;
1203
1204 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1205
1206 if (result->chunk == NULL)
1207 {
1208 free(result);
1209 return NULL;
1210 }
1211
1212 return &result->base.b.b;
1213 }