1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <gelf.h>
28 #include <libelf.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include "pipe/p_defines.h"
32 #include "pipe/p_state.h"
33 #include "pipe/p_context.h"
34 #include "util/u_blitter.h"
35 #include "util/list.h"
36 #include "util/u_transfer.h"
37 #include "util/u_surface.h"
38 #include "util/u_pack_color.h"
39 #include "util/u_memory.h"
40 #include "util/u_inlines.h"
41 #include "util/u_framebuffer.h"
42 #include "pipebuffer/pb_buffer.h"
43 #include "evergreend.h"
44 #include "r600_shader.h"
45 #include "r600_pipe.h"
46 #include "r600_formats.h"
47 #include "evergreen_compute.h"
48 #include "evergreen_compute_internal.h"
49 #include "compute_memory_pool.h"
50 #include "sb/sb_public.h"
51 #include <inttypes.h>
52
53 /**
54 RAT0 is for global binding write
55 VTX1 is for global binding read
56
57 for writing images: RAT1...
58 for reading images: TEX2...
59 TEX2 is paired with RAT1
60 
61 TEX2... consumes the same fetch resources that VTX2... would consume
62 
63 CONST0 and VTX0 are for parameters
64 CONST0 binds the smaller input parameter buffer and is used for constant indexing;
65 it is also cached in the constant cache
66 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
67 the constant cache can handle
68 
69 RATs are limited to 12, so we can bind at most 11 textures for writing,
70 because we reserve RAT0 for global bindings. With byte addressing enabled
71 we should reserve another one too => at most 10 image bindings for writing.
72 
73 from Nvidia OpenCL:
74 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
75 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
76 
77 so 10 for writing is enough; 176 is the max for reading according to the docs
78 
79 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
80 writable images also consume TEX slots, and VTX slots too, because of linear indexing
81
82 */
83
84 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
85 unsigned size)
86 {
87 struct pipe_resource *buffer = NULL;
88 assert(size);
89
90 buffer = pipe_buffer_create((struct pipe_screen*) screen,
91 0, PIPE_USAGE_IMMUTABLE, size);
92
93 return (struct r600_resource *)buffer;
94 }
95
96
97 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
98 unsigned id,
99 struct r600_resource *bo,
100 int start,
101 int size)
102 {
103 struct pipe_surface rat_templ;
104 struct r600_surface *surf = NULL;
105 struct r600_context *rctx = NULL;
106
107 assert(id < 12);
108 assert((size & 3) == 0);
109 assert((start & 0xFF) == 0);
110
111 rctx = pipe->ctx;
112
113 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
114
115 /* Create the RAT surface */
116 memset(&rat_templ, 0, sizeof(rat_templ));
117 rat_templ.format = PIPE_FORMAT_R32_UINT;
118 rat_templ.u.tex.level = 0;
119 rat_templ.u.tex.first_layer = 0;
120 rat_templ.u.tex.last_layer = 0;
121
122 	/* Add the RAT to the list of color buffers */
123 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
124 (struct pipe_context *)pipe->ctx,
125 (struct pipe_resource *)bo, &rat_templ);
126
127 /* Update the number of color buffers */
128 pipe->ctx->framebuffer.state.nr_cbufs =
129 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
130
131 /* Update the cb_target_mask
132 * XXX: I think this is a potential spot for bugs once we start doing
133 * GL interop. cb_target_mask may be modified in the 3D sections
134 * of this driver. */
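	/* Each color buffer owns a 4-bit channel write mask in CB_TARGET_MASK,
	 * hence enabling all four channels of this RAT's slot with 0xf << (id * 4). */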
135 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
136
137 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
138 evergreen_init_color_surface_rat(rctx, surf);
139 }
140
141 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
142 unsigned vb_index,
143 unsigned offset,
144 struct pipe_resource *buffer)
145 {
146 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
147 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
148 vb->stride = 1;
149 vb->buffer_offset = offset;
150 vb->buffer.resource = buffer;
151 vb->is_user_buffer = false;
152
153 /* The vertex instructions in the compute shaders use the texture cache,
154 * so we need to invalidate it. */
155 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
156 state->enabled_mask |= 1 << vb_index;
157 state->dirty_mask |= 1 << vb_index;
158 r600_mark_atom_dirty(rctx, &state->atom);
159 }
160
161 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
162 unsigned cb_index,
163 unsigned offset,
164 unsigned size,
165 struct pipe_resource *buffer)
166 {
167 struct pipe_constant_buffer cb;
168 cb.buffer_size = size;
169 cb.buffer_offset = offset;
170 cb.buffer = buffer;
171 cb.user_buffer = NULL;
172
173 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
174 }
175
176 /* We need to define these R600 registers here, because we can't include
177 * evergreend.h and r600d.h.
178 */
179 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
180 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
181
182 #ifdef HAVE_OPENCL
183 static void parse_symbol_table(Elf_Data *symbol_table_data,
184 const GElf_Shdr *symbol_table_header,
185 struct ac_shader_binary *binary)
186 {
187 GElf_Sym symbol;
188 unsigned i = 0;
189 unsigned symbol_count =
190 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
191
192 /* We are over allocating this list, because symbol_count gives the
193 * total number of symbols, and we will only be filling the list
194 * with offsets of global symbols. The memory savings from
195 * allocating the correct size of this list will be small, and
196 * I don't think it is worth the cost of pre-computing the number
197 * of global symbols.
198 */
199 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
200
201 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
202 unsigned i;
203 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
204 symbol.st_shndx == 0 /* Undefined symbol */) {
205 continue;
206 }
207
208 binary->global_symbol_offsets[binary->global_symbol_count] =
209 symbol.st_value;
210
211 		/* Keep the list sorted (ascending) by bubbling the new offset
212 		 * down into place. The list will usually be small. */
213 for (i = binary->global_symbol_count; i > 0; --i) {
214 uint64_t lhs = binary->global_symbol_offsets[i - 1];
215 uint64_t rhs = binary->global_symbol_offsets[i];
216 if (lhs < rhs) {
217 break;
218 }
219 binary->global_symbol_offsets[i] = lhs;
220 binary->global_symbol_offsets[i - 1] = rhs;
221 }
222 ++binary->global_symbol_count;
223 }
224 }
225
226
227 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
228 unsigned symbol_sh_link,
229 struct ac_shader_binary *binary)
230 {
231 unsigned i;
232
233 if (!relocs || !symbols || !binary->reloc_count) {
234 return;
235 }
236 binary->relocs = CALLOC(binary->reloc_count,
237 sizeof(struct ac_shader_reloc));
238 for (i = 0; i < binary->reloc_count; i++) {
239 GElf_Sym symbol;
240 GElf_Rel rel;
241 char *symbol_name;
242 struct ac_shader_reloc *reloc = &binary->relocs[i];
243
244 gelf_getrel(relocs, i, &rel);
245 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
246 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
247
248 reloc->offset = rel.r_offset;
249 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
250 reloc->name[sizeof(reloc->name)-1] = 0;
251 }
252 }
253
254 static void r600_elf_read(const char *elf_data, unsigned elf_size,
255 struct ac_shader_binary *binary)
256 {
257 char *elf_buffer;
258 Elf *elf;
259 Elf_Scn *section = NULL;
260 Elf_Data *symbols = NULL, *relocs = NULL;
261 size_t section_str_index;
262 unsigned symbol_sh_link = 0;
263
264 /* One of the libelf implementations
265 * (http://www.mr511.de/software/english.htm) requires calling
266 * elf_version() before elf_memory().
267 */
268 elf_version(EV_CURRENT);
269 elf_buffer = MALLOC(elf_size);
270 memcpy(elf_buffer, elf_data, elf_size);
271
272 elf = elf_memory(elf_buffer, elf_size);
273
274 elf_getshdrstrndx(elf, &section_str_index);
275
276 while ((section = elf_nextscn(elf, section))) {
277 const char *name;
278 Elf_Data *section_data = NULL;
279 GElf_Shdr section_header;
280 if (gelf_getshdr(section, &section_header) != &section_header) {
281 fprintf(stderr, "Failed to read ELF section header\n");
282 return;
283 }
284 name = elf_strptr(elf, section_str_index, section_header.sh_name);
285 if (!strcmp(name, ".text")) {
286 section_data = elf_getdata(section, section_data);
287 binary->code_size = section_data->d_size;
288 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
289 memcpy(binary->code, section_data->d_buf, binary->code_size);
290 } else if (!strcmp(name, ".AMDGPU.config")) {
291 section_data = elf_getdata(section, section_data);
292 binary->config_size = section_data->d_size;
293 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
294 memcpy(binary->config, section_data->d_buf, binary->config_size);
295 } else if (!strcmp(name, ".AMDGPU.disasm")) {
296 /* Always read disassembly if it's available. */
297 section_data = elf_getdata(section, section_data);
298 binary->disasm_string = strndup(section_data->d_buf,
299 section_data->d_size);
300 } else if (!strncmp(name, ".rodata", 7)) {
301 section_data = elf_getdata(section, section_data);
302 binary->rodata_size = section_data->d_size;
303 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
304 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
305 } else if (!strncmp(name, ".symtab", 7)) {
306 symbols = elf_getdata(section, section_data);
307 symbol_sh_link = section_header.sh_link;
308 parse_symbol_table(symbols, &section_header, binary);
309 } else if (!strcmp(name, ".rel.text")) {
310 relocs = elf_getdata(section, section_data);
311 binary->reloc_count = section_header.sh_size /
312 section_header.sh_entsize;
313 }
314 }
315
316 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
317
318 if (elf){
319 elf_end(elf);
320 }
321 FREE(elf_buffer);
322
323 /* Cache the config size per symbol */
324 if (binary->global_symbol_count) {
325 binary->config_size_per_symbol =
326 binary->config_size / binary->global_symbol_count;
327 } else {
328 binary->global_symbol_count = 1;
329 binary->config_size_per_symbol = binary->config_size;
330 }
331 }
332
333 static const unsigned char *r600_shader_binary_config_start(
334 const struct ac_shader_binary *binary,
335 uint64_t symbol_offset)
336 {
337 unsigned i;
338 for (i = 0; i < binary->global_symbol_count; ++i) {
339 if (binary->global_symbol_offsets[i] == symbol_offset) {
340 unsigned offset = i * binary->config_size_per_symbol;
341 return binary->config + offset;
342 }
343 }
344 return binary->config;
345 }
346
347 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
348 struct r600_bytecode *bc,
349 uint64_t symbol_offset,
350 boolean *use_kill)
351 {
352 unsigned i;
353 const unsigned char *config =
354 r600_shader_binary_config_start(binary, symbol_offset);
355
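	/* The .AMDGPU.config payload is a sequence of (register, value) dword
	 * pairs emitted by the compiler, hence the 8-byte stride below. */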
356 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
357 unsigned reg =
358 util_le32_to_cpu(*(uint32_t*)(config + i));
359 unsigned value =
360 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
361 switch (reg) {
362 /* R600 / R700 */
363 case R_028850_SQ_PGM_RESOURCES_PS:
364 case R_028868_SQ_PGM_RESOURCES_VS:
365 /* Evergreen / Northern Islands */
366 case R_028844_SQ_PGM_RESOURCES_PS:
367 case R_028860_SQ_PGM_RESOURCES_VS:
368 case R_0288D4_SQ_PGM_RESOURCES_LS:
369 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
370 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
371 break;
372 case R_02880C_DB_SHADER_CONTROL:
373 *use_kill = G_02880C_KILL_ENABLE(value);
374 break;
375 case R_0288E8_SQ_LDS_ALLOC:
376 bc->nlds_dw = value;
377 break;
378 }
379 }
380 }
381
382 static unsigned r600_create_shader(struct r600_bytecode *bc,
383 const struct ac_shader_binary *binary,
384 boolean *use_kill)
385
386 {
387 assert(binary->code_size % 4 == 0);
388 bc->bytecode = CALLOC(1, binary->code_size);
389 memcpy(bc->bytecode, binary->code, binary->code_size);
390 bc->ndw = binary->code_size / 4;
391
392 r600_shader_binary_read_config(binary, bc, 0, use_kill);
393 return 0;
394 }
395
396 #endif
397
398 static void r600_destroy_shader(struct r600_bytecode *bc)
399 {
400 FREE(bc->bytecode);
401 }
402
403 static void *evergreen_create_compute_state(struct pipe_context *ctx,
404 const struct pipe_compute_state *cso)
405 {
406 struct r600_context *rctx = (struct r600_context *)ctx;
407 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
408 #ifdef HAVE_OPENCL
409 const struct pipe_llvm_program_header *header;
410 const char *code;
411 void *p;
412 boolean use_kill;
413
414 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
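	/* cso->prog starts with a pipe_llvm_program_header whose num_bytes field
	 * gives the size of the ELF binary that immediately follows it. */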
415 header = cso->prog;
416 code = cso->prog + sizeof(struct pipe_llvm_program_header);
417 radeon_shader_binary_init(&shader->binary);
418 r600_elf_read(code, header->num_bytes, &shader->binary);
419 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
420
421 /* Upload code + ROdata */
422 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
423 shader->bc.ndw * 4);
424 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
425 //TODO: use util_memcpy_cpu_to_le32 ?
426 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
427 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
428 #endif
429
430 shader->ctx = rctx;
431 shader->local_size = cso->req_local_mem;
432 shader->private_size = cso->req_private_mem;
433 shader->input_size = cso->req_input_mem;
434
435 return shader;
436 }
437
438 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
439 {
440 struct r600_context *rctx = (struct r600_context *)ctx;
441 struct r600_pipe_compute *shader = state;
442
443 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
444
445 if (!shader)
446 return;
447
448 #ifdef HAVE_OPENCL
449 radeon_shader_binary_clean(&shader->binary);
450 #endif
451 r600_destroy_shader(&shader->bc);
452
453 /* TODO destroy shader->code_bo, shader->const_bo
454 * we'll need something like r600_buffer_free */
455 FREE(shader);
456 }
457
458 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
459 {
460 struct r600_context *rctx = (struct r600_context *)ctx;
461
462 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
463
464 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
465 }
466
467 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
468  * kernel parameters, there are implicit parameters that need to be stored
469  * in the vertex buffer as well. Here is how these parameters are organized in
470 * the buffer:
471 *
472 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
473 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
474 * DWORDS 6-8: Number of work items within each work group in each dimension
475 * (x,y,z)
476 * DWORDS 9+ : Kernel parameters
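 *
 * For example, a launch with block = {8,8,1} and grid = {4,4,1} stores
 * {4,4,1} in DWORDS 0-2, {32,32,1} (grid * block per dimension) in
 * DWORDS 3-5 and {8,8,1} in DWORDS 6-8; the caller's kernel arguments
 * then start at DWORD 9.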
477 */
478 static void evergreen_compute_upload_input(struct pipe_context *ctx,
479 const struct pipe_grid_info *info)
480 {
481 struct r600_context *rctx = (struct r600_context *)ctx;
482 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
483 unsigned i;
484 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
485 * parameters.
486 */
487 unsigned input_size = shader->input_size + 36;
488 uint32_t *num_work_groups_start;
489 uint32_t *global_size_start;
490 uint32_t *local_size_start;
491 uint32_t *kernel_parameters_start;
492 struct pipe_box box;
493 struct pipe_transfer *transfer = NULL;
494
495 if (shader->input_size == 0) {
496 return;
497 }
498
499 if (!shader->kernel_param) {
500 /* Add space for the grid dimensions */
501 shader->kernel_param = (struct r600_resource *)
502 pipe_buffer_create(ctx->screen, 0,
503 PIPE_USAGE_IMMUTABLE, input_size);
504 }
505
506 u_box_1d(0, input_size, &box);
507 num_work_groups_start = ctx->transfer_map(ctx,
508 (struct pipe_resource*)shader->kernel_param,
509 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
510 &box, &transfer);
511 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
512 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
513 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
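	/* The arithmetic above is on uint32_t pointers, so each step of
	 * (3 * sizeof(uint) / 4) skips the three dwords of the preceding
	 * implicit parameter block. */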
514
515 /* Copy the work group size */
516 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
517
518 /* Copy the global size */
519 for (i = 0; i < 3; i++) {
520 global_size_start[i] = info->grid[i] * info->block[i];
521 }
522
523 /* Copy the local dimensions */
524 memcpy(local_size_start, info->block, 3 * sizeof(uint));
525
526 /* Copy the kernel inputs */
527 memcpy(kernel_parameters_start, info->input, shader->input_size);
528
529 for (i = 0; i < (input_size / 4); i++) {
530 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
531 ((unsigned*)num_work_groups_start)[i]);
532 }
533
534 ctx->transfer_unmap(ctx, transfer);
535
536 /* ID=0 and ID=3 are reserved for the parameters.
537 	 * LLVM prefers ID=0, but it does not work for dynamic
538 * indices. */
539 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
540 (struct pipe_resource*)shader->kernel_param);
541 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
542 (struct pipe_resource*)shader->kernel_param);
543 }
544
545 static void evergreen_emit_dispatch(struct r600_context *rctx,
546 const struct pipe_grid_info *info)
547 {
548 int i;
549 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
550 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
551 unsigned num_waves;
552 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
553 unsigned wave_divisor = (16 * num_pipes);
554 int group_size = 1;
555 int grid_size = 1;
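	/* LDS is allocated in dwords: convert the kernel's byte-sized local
	 * memory request and add the dwords the compiled shader itself
	 * requires (bc.nlds_dw, taken from its config). */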
556 unsigned lds_size = shader->local_size / 4 +
557 shader->bc.nlds_dw;
558
559
560 /* Calculate group_size/grid_size */
561 for (i = 0; i < 3; i++) {
562 group_size *= info->block[i];
563 }
564
565 for (i = 0; i < 3; i++) {
566 grid_size *= info->grid[i];
567 }
568
569 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
570 num_waves = (info->block[0] * info->block[1] * info->block[2] +
571 wave_divisor - 1) / wave_divisor;
572
573 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
574 "%u wavefronts per thread block, "
575 "allocating %u dwords lds.\n",
576 num_pipes, num_waves, lds_size);
577
578 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
579
580 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
581 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
582 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
583 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
584
585 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
586 group_size);
587
588 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
589 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
590 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
591 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
592
593 if (rctx->b.chip_class < CAYMAN) {
594 assert(lds_size <= 8192);
595 } else {
596 /* Cayman appears to have a slightly smaller limit, see the
597 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
598 assert(lds_size <= 8160);
599 }
600
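	/* SQ_LDS_ALLOC takes the LDS size in dwords in its low bits and the
	 * per-thread-group wave count in the field starting at bit 14. */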
601 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
602 lds_size | (num_waves << 14));
603
604 /* Dispatch packet */
605 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
606 radeon_emit(cs, info->grid[0]);
607 radeon_emit(cs, info->grid[1]);
608 radeon_emit(cs, info->grid[2]);
609 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
610 radeon_emit(cs, 1);
611
612 if (rctx->is_debug)
613 eg_trace_emit(rctx);
614 }
615
616 static void compute_emit_cs(struct r600_context *rctx,
617 const struct pipe_grid_info *info)
618 {
619 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
620 unsigned i;
621
622 	/* make sure that the gfx ring is the only one active */
623 if (radeon_emitted(rctx->b.dma.cs, 0)) {
624 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
625 }
626
627 /* Initialize all the compute-related registers.
628 *
629 * See evergreen_init_atom_start_compute_cs() in this file for the list
630 * of registers initialized by the start_compute_cs_cmd atom.
631 */
632 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
633
634 /* emit config state */
635 if (rctx->b.chip_class == EVERGREEN)
636 r600_emit_atom(rctx, &rctx->config_state.atom);
637
638 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
639 r600_flush_emit(rctx);
640
641 /* Emit colorbuffers. */
642 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
643 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
644 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
645 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
646 (struct r600_resource*)cb->base.texture,
647 RADEON_USAGE_READWRITE,
648 RADEON_PRIO_SHADER_RW_BUFFER);
649
650 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
651 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
652 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
653 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
654 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
655 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
656 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
657 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
658
659 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
660 radeon_emit(cs, reloc);
661
662 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
663 radeon_emit(cs, reloc);
664 }
665 for (; i < 8 ; i++)
666 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
667 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
668 for (; i < 12; i++)
669 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
670 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
671
672 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
673 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
674 rctx->compute_cb_target_mask);
675
676
677 /* Emit vertex buffer state */
678 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
679 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
680
681 /* Emit constant buffer state */
682 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
683
684 /* Emit sampler state */
685 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
686
687 /* Emit sampler view (texture resource) state */
688 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
689
690 /* Emit compute shader state */
691 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
692
693 /* Emit dispatch state and dispatch packet */
694 evergreen_emit_dispatch(rctx, info);
695
696 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
697 */
698 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
699 R600_CONTEXT_INV_VERTEX_CACHE |
700 R600_CONTEXT_INV_TEX_CACHE;
701 r600_flush_emit(rctx);
702 rctx->b.flags = 0;
703
704 if (rctx->b.chip_class >= CAYMAN) {
705 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
706 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
707 /* DEALLOC_STATE prevents the GPU from hanging when a
708 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
709 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
710 */
711 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
712 radeon_emit(cs, 0);
713 }
714
715 #if 0
716 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
717 for (i = 0; i < cs->cdw; i++) {
718 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
719 }
720 #endif
721
722 }
723
724
725 /**
726 * Emit function for r600_cs_shader_state atom
727 */
728 void evergreen_emit_cs_shader(struct r600_context *rctx,
729 struct r600_atom *atom)
730 {
731 struct r600_cs_shader_state *state =
732 (struct r600_cs_shader_state*)atom;
733 struct r600_pipe_compute *shader = state->shader;
734 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
735 uint64_t va;
736 struct r600_resource *code_bo;
737 unsigned ngpr, nstack;
738
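	/* state->pc is the offset of the selected kernel within code_bo, so the
	 * program start address points at that kernel's first instruction. */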
739 code_bo = shader->code_bo;
740 va = shader->code_bo->gpu_address + state->pc;
741 ngpr = shader->bc.ngpr;
742 nstack = shader->bc.nstack;
743
744 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
745 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
746 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
747 S_0288D4_NUM_GPRS(ngpr)
748 | S_0288D4_STACK_SIZE(nstack));
749 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
750
751 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
752 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
753 code_bo, RADEON_USAGE_READ,
754 RADEON_PRIO_SHADER_BINARY));
755 }
756
757 static void evergreen_launch_grid(struct pipe_context *ctx,
758 const struct pipe_grid_info *info)
759 {
760 struct r600_context *rctx = (struct r600_context *)ctx;
761 #ifdef HAVE_OPENCL
762 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
763 boolean use_kill;
764
765 rctx->cs_shader_state.pc = info->pc;
766 /* Get the config information for this kernel. */
767 r600_shader_binary_read_config(&shader->binary, &shader->bc,
768 info->pc, &use_kill);
769 #endif
770
771 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
772
773
774 evergreen_compute_upload_input(ctx, info);
775 compute_emit_cs(rctx, info);
776 }
777
778 static void evergreen_set_compute_resources(struct pipe_context *ctx,
779 unsigned start, unsigned count,
780 struct pipe_surface **surfaces)
781 {
782 struct r600_context *rctx = (struct r600_context *)ctx;
783 struct r600_surface **resources = (struct r600_surface **)surfaces;
784
785 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
786 start, count);
787
788 for (unsigned i = 0; i < count; i++) {
789 		/* The first four vertex buffers are reserved for parameters and
790 * global buffers. */
791 unsigned vtx_id = 4 + i;
792 if (resources[i]) {
793 struct r600_resource_global *buffer =
794 (struct r600_resource_global*)
795 resources[i]->base.texture;
796 if (resources[i]->base.writable) {
797 assert(i+1 < 12);
798
799 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
800 (struct r600_resource *)resources[i]->base.texture,
801 buffer->chunk->start_in_dw*4,
802 resources[i]->base.texture->width0);
803 }
804
805 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
806 buffer->chunk->start_in_dw * 4,
807 resources[i]->base.texture);
808 }
809 }
810 }
811
812 static void evergreen_set_global_binding(struct pipe_context *ctx,
813 unsigned first, unsigned n,
814 struct pipe_resource **resources,
815 uint32_t **handles)
816 {
817 struct r600_context *rctx = (struct r600_context *)ctx;
818 struct compute_memory_pool *pool = rctx->screen->global_pool;
819 struct r600_resource_global **buffers =
820 (struct r600_resource_global **)resources;
821 unsigned i;
822
823 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
824 first, n);
825
826 if (!resources) {
827 /* XXX: Unset */
828 return;
829 }
830
831 /* We mark these items for promotion to the pool if they
832 * aren't already there */
833 for (i = first; i < first + n; i++) {
834 struct compute_memory_item *item = buffers[i]->chunk;
835
836 if (!is_item_in_pool(item))
837 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
838 }
839
840 if (compute_memory_finalize_pending(pool, ctx) == -1) {
841 /* XXX: Unset */
842 return;
843 }
844
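	/* Patch each handle: the value passed in is an offset within its own
	 * buffer, to which we add the buffer's byte offset within the global
	 * memory pool. */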
845 for (i = first; i < first + n; i++)
846 {
847 uint32_t buffer_offset;
848 uint32_t handle;
849 assert(resources[i]->target == PIPE_BUFFER);
850 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
851
852 buffer_offset = util_le32_to_cpu(*(handles[i]));
853 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
854
855 *(handles[i]) = util_cpu_to_le32(handle);
856 }
857
858 /* globals for writing */
859 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
860 /* globals for reading */
861 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
862 (struct pipe_resource*)pool->bo);
863
864 	/* constants for reading, LLVM puts them in the .text segment */
865 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
866 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
867 }
868
869 /**
870 * This function initializes all the compute specific registers that need to
871 * be initialized for each compute command stream. Registers that are common
872 * to both compute and 3D will be initialized at the beginning of each compute
873 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
874 * packet requires that the shader type bit be set, we must initialize all
875 * context registers needed for compute in this function. The registers
876 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
877 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
878 * on the GPU family.
879 */
880 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
881 {
882 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
883 int num_threads;
884 int num_stack_entries;
885
886 /* since all required registers are initialized in the
887 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
888 */
889 r600_init_command_buffer(cb, 256);
890 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
891
892 /* This must be first. */
893 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
894 r600_store_value(cb, 0x80000000);
895 r600_store_value(cb, 0x80000000);
896
897 /* We're setting config registers here. */
898 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
899 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
900
901 switch (rctx->b.family) {
902 case CHIP_CEDAR:
903 default:
904 num_threads = 128;
905 num_stack_entries = 256;
906 break;
907 case CHIP_REDWOOD:
908 num_threads = 128;
909 num_stack_entries = 256;
910 break;
911 case CHIP_JUNIPER:
912 num_threads = 128;
913 num_stack_entries = 512;
914 break;
915 case CHIP_CYPRESS:
916 case CHIP_HEMLOCK:
917 num_threads = 128;
918 num_stack_entries = 512;
919 break;
920 case CHIP_PALM:
921 num_threads = 128;
922 num_stack_entries = 256;
923 break;
924 case CHIP_SUMO:
925 num_threads = 128;
926 num_stack_entries = 256;
927 break;
928 case CHIP_SUMO2:
929 num_threads = 128;
930 num_stack_entries = 512;
931 break;
932 case CHIP_BARTS:
933 num_threads = 128;
934 num_stack_entries = 512;
935 break;
936 case CHIP_TURKS:
937 num_threads = 128;
938 num_stack_entries = 256;
939 break;
940 case CHIP_CAICOS:
941 num_threads = 128;
942 num_stack_entries = 256;
943 break;
944 }
945
946 /* Config Registers */
947 if (rctx->b.chip_class < CAYMAN)
948 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
949 rctx->screen->b.info.drm_minor);
950 else
951 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
952 rctx->screen->b.info.drm_minor);
953
954 /* The primitive type always needs to be POINTLIST for compute. */
955 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
956 V_008958_DI_PT_POINTLIST);
957
958 if (rctx->b.chip_class < CAYMAN) {
959
960 /* These registers control which simds can be used by each stage.
961 * The default for these registers is 0xffffffff, which means
962 * all simds are available for each stage. It's possible we may
963 * want to play around with these in the future, but for now
964 * the default value is fine.
965 *
966 * R_008E20_SQ_STATIC_THREAD_MGMT1
967 * R_008E24_SQ_STATIC_THREAD_MGMT2
968 * R_008E28_SQ_STATIC_THREAD_MGMT3
969 */
970
971 /* XXX: We may need to adjust the thread and stack resource
972 * values for 3D/compute interop */
973
974 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
975
976 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
977 * Set the number of threads used by the PS/VS/GS/ES stage to
978 * 0.
979 */
980 r600_store_value(cb, 0);
981
982 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
983 * Set the number of threads used by the CS (aka LS) stage to
984 * the maximum number of threads and set the number of threads
985 * for the HS stage to 0. */
986 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
987
988 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
989 * Set the Control Flow stack entries to 0 for PS/VS stages */
990 r600_store_value(cb, 0);
991
992 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
993 * Set the Control Flow stack entries to 0 for GS/ES stages */
994 r600_store_value(cb, 0);
995
996 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
997 		 * Set the Control Flow stack entries to 0 for the HS stage, and
998 * set it to the maximum value for the CS (aka LS) stage. */
999 r600_store_value(cb,
1000 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1001 }
1002 /* Give the compute shader all the available LDS space.
1003 * NOTE: This only sets the maximum number of dwords that a compute
1004 * shader can allocate. When a shader is executed, we still need to
1005 * allocate the appropriate amount of LDS dwords using the
1006 * CM_R_0288E8_SQ_LDS_ALLOC register.
1007 */
1008 if (rctx->b.chip_class < CAYMAN) {
1009 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1010 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1011 } else {
1012 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1013 S_0286FC_NUM_PS_LDS(0) |
1014 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1015 }
1016
1017 /* Context Registers */
1018
1019 if (rctx->b.chip_class < CAYMAN) {
1020 /* workaround for hw issues with dyn gpr - must set all limits
1021 * to 240 instead of 0, 0x1e == 240 / 8
1022 */
1023 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1024 S_028838_PS_GPRS(0x1e) |
1025 S_028838_VS_GPRS(0x1e) |
1026 S_028838_GS_GPRS(0x1e) |
1027 S_028838_ES_GPRS(0x1e) |
1028 S_028838_HS_GPRS(0x1e) |
1029 S_028838_LS_GPRS(0x1e));
1030 }
1031
1032 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1033 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1034 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1035
1036 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1037
1038 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1039 S_0286E8_TID_IN_GROUP_ENA(1) |
1040 S_0286E8_TGID_ENA(1) |
1041 S_0286E8_DISABLE_INDEX_PACK(1));
1042
1043 	/* The LOOP_CONST registers are an optimization for loops that allows
1044 	 * you to store the initial counter, increment value, and maximum
1045 	 * counter value in a register so that the hardware can calculate the
1046 	 * correct number of iterations for the loop, so that you don't need
1047 	 * to have the loop counter in your shader code. We don't currently use
1048 	 * this optimization, so we must keep track of the counter in the
1049 	 * shader and use a break instruction to exit loops. However, the
1050 	 * hardware still uses this register to determine when to exit a
1051 	 * loop, so we need to initialize the counter to 0, set the increment
1052 	 * value to 1 and the maximum counter value to 4095 (0xfff), which
1053 	 * is the maximum value allowed. This gives us a maximum of 4096
1054 	 * iterations for our loops, but hopefully our break instruction will
1055 	 * execute some time before the 4096th iteration.
1056 */
1057 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1058 }
1059
1060 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1061 {
1062 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1063 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1064 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1065 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1066 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1067 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1068 rctx->b.b.launch_grid = evergreen_launch_grid;
1069
1070 }
1071
1072 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1073 struct pipe_resource *resource,
1074 unsigned level,
1075 unsigned usage,
1076 const struct pipe_box *box,
1077 struct pipe_transfer **ptransfer)
1078 {
1079 struct r600_context *rctx = (struct r600_context*)ctx;
1080 struct compute_memory_pool *pool = rctx->screen->global_pool;
1081 struct r600_resource_global* buffer =
1082 (struct r600_resource_global*)resource;
1083
1084 struct compute_memory_item *item = buffer->chunk;
1085 struct pipe_resource *dst = NULL;
1086 unsigned offset = box->x;
1087
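	/* Items living in the pool must be demoted to their own buffer before
	 * they can be mapped; items not yet in the pool get a dedicated buffer
	 * on first use. */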
1088 if (is_item_in_pool(item)) {
1089 compute_memory_demote_item(pool, item, ctx);
1090 }
1091 else {
1092 if (item->real_buffer == NULL) {
1093 item->real_buffer =
1094 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1095 }
1096 }
1097
1098 dst = (struct pipe_resource*)item->real_buffer;
1099
1100 if (usage & PIPE_TRANSFER_READ)
1101 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1102
1103 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1104 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1105 "width = %u, height = %u, depth = %u)\n", level, usage,
1106 box->x, box->y, box->z, box->width, box->height,
1107 box->depth);
1108 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1109 "%u (box.x)\n", item->id, box->x);
1110
1111
1112 assert(resource->target == PIPE_BUFFER);
1113 assert(resource->bind & PIPE_BIND_GLOBAL);
1114 assert(box->x >= 0);
1115 assert(box->y == 0);
1116 assert(box->z == 0);
1117
1118 ///TODO: do it better, mapping is not possible if the pool is too big
1119 return pipe_buffer_map_range(ctx, dst,
1120 offset, box->width, usage, ptransfer);
1121 }
1122
1123 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1124 struct pipe_transfer *transfer)
1125 {
1126 	/* struct r600_resource_global are not real resources; they just map
1127 	 * to an offset within the compute memory pool. The function
1128 	 * r600_compute_global_transfer_map() maps the memory pool
1129 	 * resource rather than the struct r600_resource_global passed to
1130 	 * it as an argument, and then initializes ptransfer->resource with
1131 	 * the memory pool resource (via pipe_buffer_map_range).
1132 	 * When transfer_unmap is called it uses the memory pool's
1133 	 * vtable, which calls r600_buffer_transfer_unmap() rather than
1134 	 * this function.
1135 	 */
1136 assert (!"This function should not be called");
1137 }
1138
1139 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1140 struct pipe_transfer *transfer,
1141 const struct pipe_box *box)
1142 {
1143 assert(0 && "TODO");
1144 }
1145
1146 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1147 struct pipe_resource *res)
1148 {
1149 struct r600_resource_global* buffer = NULL;
1150 struct r600_screen* rscreen = NULL;
1151
1152 assert(res->target == PIPE_BUFFER);
1153 assert(res->bind & PIPE_BIND_GLOBAL);
1154
1155 buffer = (struct r600_resource_global*)res;
1156 rscreen = (struct r600_screen*)screen;
1157
1158 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1159
1160 buffer->chunk = NULL;
1161 free(res);
1162 }
1163
1164 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1165 {
1166 u_default_resource_get_handle, /* get_handle */
1167 r600_compute_global_buffer_destroy, /* resource_destroy */
1168 r600_compute_global_transfer_map, /* transfer_map */
1169 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1170 r600_compute_global_transfer_unmap, /* transfer_unmap */
1171 };
1172
1173 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1174 const struct pipe_resource *templ)
1175 {
1176 struct r600_resource_global* result = NULL;
1177 struct r600_screen* rscreen = NULL;
1178 int size_in_dw = 0;
1179
1180 assert(templ->target == PIPE_BUFFER);
1181 assert(templ->bind & PIPE_BIND_GLOBAL);
1182 assert(templ->array_size == 1 || templ->array_size == 0);
1183 assert(templ->depth0 == 1 || templ->depth0 == 0);
1184 assert(templ->height0 == 1 || templ->height0 == 0);
1185
1186 result = (struct r600_resource_global*)
1187 CALLOC(sizeof(struct r600_resource_global), 1);
1188 rscreen = (struct r600_screen*)screen;
1189
1190 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1191 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1192 templ->array_size);
1193
1194 result->base.b.vtbl = &r600_global_buffer_vtbl;
1195 result->base.b.b = *templ;
1196 result->base.b.b.screen = screen;
1197 pipe_reference_init(&result->base.b.b.reference, 1);
1198
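	/* The compute memory pool is managed in dword units, so round the
	 * requested byte size up to whole dwords. */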
1199 size_in_dw = (templ->width0+3) / 4;
1200
1201 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1202
1203 if (result->chunk == NULL)
1204 {
1205 free(result);
1206 return NULL;
1207 }
1208
1209 return &result->base.b.b;
1210 }