src/gallium/drivers/r600/evergreen_compute.c

   1 /*
   2  * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Adam Rak <adam.rak@streamnovation.com>
  25  */
  26
  27 #ifdef HAVE_OPENCL
  28 #include <gelf.h>
  29 #include <libelf.h>
  30 #endif
  31 #include <stdio.h>
  32 #include <errno.h>
  33 #include "pipe/p_defines.h"
  34 #include "pipe/p_state.h"
  35 #include "pipe/p_context.h"
  36 #include "util/u_blitter.h"
  37 #include "util/list.h"
  38 #include "util/u_transfer.h"
  39 #include "util/u_surface.h"
  40 #include "util/u_pack_color.h"
  41 #include "util/u_memory.h"
  42 #include "util/u_inlines.h"
  43 #include "util/u_framebuffer.h"
  44 #include "tgsi/tgsi_parse.h"
  45 #include "pipebuffer/pb_buffer.h"
  46 #include "evergreend.h"
  47 #include "r600_shader.h"
  48 #include "r600_pipe.h"
  49 #include "r600_formats.h"
  50 #include "evergreen_compute.h"
  51 #include "evergreen_compute_internal.h"
  52 #include "compute_memory_pool.h"
  53 #include "sb/sb_public.h"
  54 #include <inttypes.h>
  55
  56 /**
  57 RAT0 is for global binding write
  58 VTX1 is for global binding read
  59
   60 for writing images RAT1...
  61 for reading images TEX2...
  62   TEX2-RAT1 is paired
  63
  64 TEX2... consumes the same fetch resources, that VTX2... would consume
  65
  66 CONST0 and VTX0 is for parameters
  67   CONST0 is binding smaller input parameter buffer, and for constant indexing,
  68   also constant cached
  69   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  70   the constant cache can handle
  71
   72 RATs are limited to 12, so we can bind at most 11 textures for writing
   73 because we reserve RAT0 for global bindings. With byte addressing enabled,
   74 we should reserve another one too => 10 image bindings for writing max.
  75
  76 from Nvidia OpenCL:
  77   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  78   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
  79
  80 so 10 for writing is enough. 176 is the max for reading according to the docs
  81
  82 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
  83 writable images will consume TEX slots, VTX slots too because of linear indexing
  84
  85 */
  86
  87 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
  88                                                      unsigned size)
  89 {
  90         struct pipe_resource *buffer = NULL;
  91         assert(size);
  92
  93         buffer = pipe_buffer_create((struct pipe_screen*) screen,
  94                                     0, PIPE_USAGE_IMMUTABLE, size);
  95
  96         return (struct r600_resource *)buffer;
  97 }
  98
  99
 100 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
 101                               unsigned id,
 102                               struct r600_resource *bo,
 103                               int start,
 104                               int size)
 105 {
 106         struct pipe_surface rat_templ;
 107         struct r600_surface *surf = NULL;
 108         struct r600_context *rctx = NULL;
 109
 110         assert(id < 12);
 111         assert((size & 3) == 0);
 112         assert((start & 0xFF) == 0);
 113
 114         rctx = pipe->ctx;
 115
 116         COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
 117
 118         /* Create the RAT surface */
 119         memset(&rat_templ, 0, sizeof(rat_templ));
 120         rat_templ.format = PIPE_FORMAT_R32_UINT;
 121         rat_templ.u.tex.level = 0;
 122         rat_templ.u.tex.first_layer = 0;
 123         rat_templ.u.tex.last_layer = 0;
 124
 125         /* Add the RAT the list of color buffers */
 126         pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
 127                 (struct pipe_context *)pipe->ctx,
 128                 (struct pipe_resource *)bo, &rat_templ);
 129
 130         /* Update the number of color buffers */
 131         pipe->ctx->framebuffer.state.nr_cbufs =
 132                 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
 133
 134         /* Update the cb_target_mask
 135          * XXX: I think this is a potential spot for bugs once we start doing
 136          * GL interop.  cb_target_mask may be modified in the 3D sections
 137          * of this driver. */
 138         pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
 139
 140         surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
 141         evergreen_init_color_surface_rat(rctx, surf);
 142 }
 143
 144 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
 145                                            unsigned vb_index,
 146                                            unsigned offset,
 147                                            struct pipe_resource *buffer)
 148 {
 149         struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
 150         struct pipe_vertex_buffer *vb = &state->vb[vb_index];
 151         vb->stride = 1;
 152         vb->buffer_offset = offset;
 153         vb->buffer.resource = buffer;
 154         vb->is_user_buffer = false;
 155
 156         /* The vertex instructions in the compute shaders use the texture cache,
 157          * so we need to invalidate it. */
 158         rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 159         state->enabled_mask |= 1 << vb_index;
 160         state->dirty_mask |= 1 << vb_index;
 161         r600_mark_atom_dirty(rctx, &state->atom);
 162 }
 163
 164 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
 165                                              unsigned cb_index,
 166                                              unsigned offset,
 167                                              unsigned size,
 168                                              struct pipe_resource *buffer)
 169 {
 170         struct pipe_constant_buffer cb;
 171         cb.buffer_size = size;
 172         cb.buffer_offset = offset;
 173         cb.buffer = buffer;
 174         cb.user_buffer = NULL;
 175
 176         rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
 177 }
 178
  179 /* We need to define these R600 registers here, because we can't include
  180  * both evergreend.h and r600d.h in the same file.
  181  */
 182 #define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
 183 #define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
 184
 185 #ifdef HAVE_OPENCL
 186 static void parse_symbol_table(Elf_Data *symbol_table_data,
 187                                 const GElf_Shdr *symbol_table_header,
 188                                 struct ac_shader_binary *binary)
 189 {
 190         GElf_Sym symbol;
 191         unsigned i = 0;
 192         unsigned symbol_count =
 193                 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
 194
 195         /* We are over allocating this list, because symbol_count gives the
 196          * total number of symbols, and we will only be filling the list
 197          * with offsets of global symbols.  The memory savings from
 198          * allocating the correct size of this list will be small, and
 199          * I don't think it is worth the cost of pre-computing the number
 200          * of global symbols.
 201          */
 202         binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
 203
 204         while (gelf_getsym(symbol_table_data, i++, &symbol)) {
 205                 unsigned i;
 206                 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
 207                     symbol.st_shndx == 0 /* Undefined symbol */) {
 208                         continue;
 209                 }
 210
 211                 binary->global_symbol_offsets[binary->global_symbol_count] =
 212                                         symbol.st_value;
 213
 214                 /* Sort the list using bubble sort.  This list will usually
 215                  * be small. */
 216                 for (i = binary->global_symbol_count; i > 0; --i) {
 217                         uint64_t lhs = binary->global_symbol_offsets[i - 1];
 218                         uint64_t rhs = binary->global_symbol_offsets[i];
 219                         if (lhs < rhs) {
 220                                 break;
 221                         }
 222                         binary->global_symbol_offsets[i] = lhs;
 223                         binary->global_symbol_offsets[i - 1] = rhs;
 224                 }
 225                 ++binary->global_symbol_count;
 226         }
 227 }
 228
 229
 230 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
 231                         unsigned symbol_sh_link,
 232                         struct ac_shader_binary *binary)
 233 {
 234         unsigned i;
 235
 236         if (!relocs || !symbols || !binary->reloc_count) {
 237                 return;
 238         }
 239         binary->relocs = CALLOC(binary->reloc_count,
 240                         sizeof(struct ac_shader_reloc));
 241         for (i = 0; i < binary->reloc_count; i++) {
 242                 GElf_Sym symbol;
 243                 GElf_Rel rel;
 244                 char *symbol_name;
 245                 struct ac_shader_reloc *reloc = &binary->relocs[i];
 246
 247                 gelf_getrel(relocs, i, &rel);
 248                 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
 249                 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
 250
 251                 reloc->offset = rel.r_offset;
 252                 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
 253                 reloc->name[sizeof(reloc->name)-1] = 0;
 254         }
 255 }
 256
 257 static void r600_elf_read(const char *elf_data, unsigned elf_size,
 258                  struct ac_shader_binary *binary)
 259 {
 260         char *elf_buffer;
 261         Elf *elf;
 262         Elf_Scn *section = NULL;
 263         Elf_Data *symbols = NULL, *relocs = NULL;
 264         size_t section_str_index;
 265         unsigned symbol_sh_link = 0;
 266
 267         /* One of the libelf implementations
 268          * (http://www.mr511.de/software/english.htm) requires calling
 269          * elf_version() before elf_memory().
 270          */
 271         elf_version(EV_CURRENT);
 272         elf_buffer = MALLOC(elf_size);
 273         memcpy(elf_buffer, elf_data, elf_size);
 274
 275         elf = elf_memory(elf_buffer, elf_size);
 276
 277         elf_getshdrstrndx(elf, &section_str_index);
 278
 279         while ((section = elf_nextscn(elf, section))) {
 280                 const char *name;
 281                 Elf_Data *section_data = NULL;
 282                 GElf_Shdr section_header;
 283                 if (gelf_getshdr(section, &section_header) != &section_header) {
 284                         fprintf(stderr, "Failed to read ELF section header\n");
 285                         return;
 286                 }
 287                 name = elf_strptr(elf, section_str_index, section_header.sh_name);
 288                 if (!strcmp(name, ".text")) {
 289                         section_data = elf_getdata(section, section_data);
 290                         binary->code_size = section_data->d_size;
 291                         binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
 292                         memcpy(binary->code, section_data->d_buf, binary->code_size);
 293                 } else if (!strcmp(name, ".AMDGPU.config")) {
 294                         section_data = elf_getdata(section, section_data);
 295                         binary->config_size = section_data->d_size;
 296                         binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
 297                         memcpy(binary->config, section_data->d_buf, binary->config_size);
 298                 } else if (!strcmp(name, ".AMDGPU.disasm")) {
 299                         /* Always read disassembly if it's available. */
 300                         section_data = elf_getdata(section, section_data);
 301                         binary->disasm_string = strndup(section_data->d_buf,
 302                                                         section_data->d_size);
 303                 } else if (!strncmp(name, ".rodata", 7)) {
 304                         section_data = elf_getdata(section, section_data);
 305                         binary->rodata_size = section_data->d_size;
 306                         binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
 307                         memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
 308                 } else if (!strncmp(name, ".symtab", 7)) {
 309                         symbols = elf_getdata(section, section_data);
 310                         symbol_sh_link = section_header.sh_link;
 311                         parse_symbol_table(symbols, &section_header, binary);
 312                 } else if (!strcmp(name, ".rel.text")) {
 313                         relocs = elf_getdata(section, section_data);
 314                         binary->reloc_count = section_header.sh_size /
 315                                         section_header.sh_entsize;
 316                 }
 317         }
 318
 319         parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
 320
 321         if (elf){
 322                 elf_end(elf);
 323         }
 324         FREE(elf_buffer);
 325
 326         /* Cache the config size per symbol */
 327         if (binary->global_symbol_count) {
 328                 binary->config_size_per_symbol =
 329                         binary->config_size / binary->global_symbol_count;
 330         } else {
 331                 binary->global_symbol_count = 1;
 332                 binary->config_size_per_symbol = binary->config_size;
 333         }
 334 }
 335
 336 static const unsigned char *r600_shader_binary_config_start(
 337         const struct ac_shader_binary *binary,
 338         uint64_t symbol_offset)
 339 {
 340         unsigned i;
 341         for (i = 0; i < binary->global_symbol_count; ++i) {
 342                 if (binary->global_symbol_offsets[i] == symbol_offset) {
 343                         unsigned offset = i * binary->config_size_per_symbol;
 344                         return binary->config + offset;
 345                 }
 346         }
 347         return binary->config;
 348 }
 349
 350 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
 351                                            struct r600_bytecode *bc,
 352                                            uint64_t symbol_offset,
 353                                            boolean *use_kill)
 354 {
 355        unsigned i;
 356        const unsigned char *config =
 357                r600_shader_binary_config_start(binary, symbol_offset);
 358
 359        for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
 360                unsigned reg =
 361                        util_le32_to_cpu(*(uint32_t*)(config + i));
 362                unsigned value =
 363                        util_le32_to_cpu(*(uint32_t*)(config + i + 4));
 364                switch (reg) {
 365                /* R600 / R700 */
 366                case R_028850_SQ_PGM_RESOURCES_PS:
 367                case R_028868_SQ_PGM_RESOURCES_VS:
 368                /* Evergreen / Northern Islands */
 369                case R_028844_SQ_PGM_RESOURCES_PS:
 370                case R_028860_SQ_PGM_RESOURCES_VS:
 371                case R_0288D4_SQ_PGM_RESOURCES_LS:
 372                        bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
 373                        bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
 374                        break;
 375                case R_02880C_DB_SHADER_CONTROL:
 376                        *use_kill = G_02880C_KILL_ENABLE(value);
 377                        break;
 378                case R_0288E8_SQ_LDS_ALLOC:
 379                        bc->nlds_dw = value;
 380                        break;
 381                }
 382        }
 383 }
 384
 385 static unsigned r600_create_shader(struct r600_bytecode *bc,
 386                                    const struct ac_shader_binary *binary,
 387                                    boolean *use_kill)
 388
 389 {
 390         assert(binary->code_size % 4 == 0);
 391         bc->bytecode = CALLOC(1, binary->code_size);
 392         memcpy(bc->bytecode, binary->code, binary->code_size);
 393         bc->ndw = binary->code_size / 4;
 394
 395         r600_shader_binary_read_config(binary, bc, 0, use_kill);
 396         return 0;
 397 }
 398
 399 #endif
 400
 401 static void r600_destroy_shader(struct r600_bytecode *bc)
 402 {
 403         FREE(bc->bytecode);
 404 }
 405
 406 static void *evergreen_create_compute_state(struct pipe_context *ctx,
 407                                             const struct pipe_compute_state *cso)
 408 {
 409         struct r600_context *rctx = (struct r600_context *)ctx;
 410         struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
 411 #ifdef HAVE_OPENCL
 412         const struct pipe_llvm_program_header *header;
 413         const char *code;
 414         void *p;
 415         boolean use_kill;
 416 #endif
 417
 418         shader->ctx = rctx;
 419         shader->local_size = cso->req_local_mem;
 420         shader->private_size = cso->req_private_mem;
 421         shader->input_size = cso->req_input_mem;
 422
 423         shader->ir_type = cso->ir_type;
 424
 425         if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
 426                 shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
 427                 return shader;
 428         }
 429 #ifdef HAVE_OPENCL
 430         COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
 431         header = cso->prog;
 432         code = cso->prog + sizeof(struct pipe_llvm_program_header);
 433         radeon_shader_binary_init(&shader->binary);
 434         r600_elf_read(code, header->num_bytes, &shader->binary);
 435         r600_create_shader(&shader->bc, &shader->binary, &use_kill);
 436
 437         /* Upload code + ROdata */
 438         shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
 439                                                         shader->bc.ndw * 4);
 440         p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
 441         //TODO: use util_memcpy_cpu_to_le32 ?
 442         memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
 443         rctx->b.ws->buffer_unmap(shader->code_bo->buf);
 444 #endif
 445
 446         return shader;
 447 }
 448
 449 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
 450 {
 451         struct r600_context *rctx = (struct r600_context *)ctx;
 452         struct r600_pipe_compute *shader = state;
 453
 454         COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
 455
 456         if (!shader)
 457                 return;
 458
 459         if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
 460                 r600_delete_shader_selector(ctx, shader->sel);
 461         } else {
 462 #ifdef HAVE_OPENCL
 463                 radeon_shader_binary_clean(&shader->binary);
 464 #endif
 465                 r600_destroy_shader(&shader->bc);
 466
 467                 /* TODO destroy shader->code_bo, shader->const_bo
 468                  * we'll need something like r600_buffer_free */
 469         }
 470         FREE(shader);
 471 }
 472
 473 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
 474 {
 475         struct r600_context *rctx = (struct r600_context *)ctx;
 476         struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
 477         COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
 478
 479         if (!state) {
 480                 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
 481                 return;
 482         }
 483
 484         if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
 485                 bool compute_dirty;
 486
 487                 r600_shader_select(ctx, cstate->sel, &compute_dirty);
 488         }
 489
 490         rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
 491 }
 492
 493 /* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
 494  * kernel parameters there are implicit parameters that need to be stored
 495  * in the vertex buffer as well.  Here is how these parameters are organized in
 496  * the buffer:
 497  *
 498  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 499  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 500  * DWORDS 6-8: Number of work items within each work group in each dimension
 501  *             (x,y,z)
 502  * DWORDS 9+ : Kernel parameters
 503  */
 504 static void evergreen_compute_upload_input(struct pipe_context *ctx,
 505                                            const struct pipe_grid_info *info)
 506 {
 507         struct r600_context *rctx = (struct r600_context *)ctx;
 508         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 509         unsigned i;
 510         /* We need to reserve 9 dwords (36 bytes) for implicit kernel
 511          * parameters.
 512          */
 513         unsigned input_size;
 514         uint32_t *num_work_groups_start;
 515         uint32_t *global_size_start;
 516         uint32_t *local_size_start;
 517         uint32_t *kernel_parameters_start;
 518         struct pipe_box box;
 519         struct pipe_transfer *transfer = NULL;
 520
 521         if (!shader)
 522                 return;
 523         if (shader->input_size == 0) {
 524                 return;
 525         }
 526         input_size = shader->input_size + 36;
 527         if (!shader->kernel_param) {
 528                 /* Add space for the grid dimensions */
 529                 shader->kernel_param = (struct r600_resource *)
 530                         pipe_buffer_create(ctx->screen, 0,
 531                                         PIPE_USAGE_IMMUTABLE, input_size);
 532         }
 533
 534         u_box_1d(0, input_size, &box);
 535         num_work_groups_start = ctx->transfer_map(ctx,
 536                         (struct pipe_resource*)shader->kernel_param,
 537                         0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
 538                         &box, &transfer);
 539         global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
 540         local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
 541         kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
 542
 543         /* Copy the work group size */
 544         memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
 545
 546         /* Copy the global size */
 547         for (i = 0; i < 3; i++) {
 548                 global_size_start[i] = info->grid[i] * info->block[i];
 549         }
 550
 551         /* Copy the local dimensions */
 552         memcpy(local_size_start, info->block, 3 * sizeof(uint));
 553
 554         /* Copy the kernel inputs */
 555         memcpy(kernel_parameters_start, info->input, shader->input_size);
 556
 557         for (i = 0; i < (input_size / 4); i++) {
 558                 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
 559                         ((unsigned*)num_work_groups_start)[i]);
 560         }
 561
 562         ctx->transfer_unmap(ctx, transfer);
 563
 564         /* ID=0 and ID=3 are reserved for the parameters.
 565          * LLVM will preferably use ID=0, but it does not work for dynamic
 566          * indices. */
 567         evergreen_cs_set_vertex_buffer(rctx, 3, 0,
 568                         (struct pipe_resource*)shader->kernel_param);
 569         evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
 570                         (struct pipe_resource*)shader->kernel_param);
 571 }
 572
 573 static void evergreen_emit_dispatch(struct r600_context *rctx,
 574                                     const struct pipe_grid_info *info,
 575                                     uint32_t indirect_grid[3])
 576 {
 577         int i;
 578         struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 579         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 580         unsigned num_waves;
 581         unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
 582         unsigned wave_divisor = (16 * num_pipes);
 583         int group_size = 1;
 584         int grid_size = 1;
 585         unsigned lds_size = shader->local_size / 4;
 586
 587         if (shader->ir_type != PIPE_SHADER_IR_TGSI)
 588                 lds_size += shader->bc.nlds_dw;
 589
 590         /* Calculate group_size/grid_size */
 591         for (i = 0; i < 3; i++) {
 592                 group_size *= info->block[i];
 593         }
 594
 595         for (i = 0; i < 3; i++) {
 596                 grid_size *= info->grid[i];
 597         }
 598
 599         /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
 600         num_waves = (info->block[0] * info->block[1] * info->block[2] +
 601                         wave_divisor - 1) / wave_divisor;
 602
 603         COMPUTE_DBG(rctx->screen, "Using %u pipes, "
 604                                 "%u wavefronts per thread block, "
 605                                 "allocating %u dwords lds.\n",
 606                                 num_pipes, num_waves, lds_size);
 607
 608         radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
 609
 610         radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
 611         radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
 612         radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
 613         radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
 614
 615         radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
 616                                                                 group_size);
 617
 618         radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
 619         radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
 620         radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
 621         radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
 622
 623         if (rctx->b.chip_class < CAYMAN) {
 624                 assert(lds_size <= 8192);
 625         } else {
 626                 /* Cayman appears to have a slightly smaller limit, see the
 627                  * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
 628                 assert(lds_size <= 8160);
 629         }
 630
 631         radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
 632                                         lds_size | (num_waves << 14));
 633
 634         if (info->indirect) {
 635                 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
 636                 radeon_emit(cs, indirect_grid[0]);
 637                 radeon_emit(cs, indirect_grid[1]);
 638                 radeon_emit(cs, indirect_grid[2]);
 639                 radeon_emit(cs, 1);
 640         } else {
 641                 /* Dispatch packet */
 642                 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
 643                 radeon_emit(cs, info->grid[0]);
 644                 radeon_emit(cs, info->grid[1]);
 645                 radeon_emit(cs, info->grid[2]);
 646                 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
 647                 radeon_emit(cs, 1);
 648         }
 649
 650         if (rctx->is_debug)
 651                 eg_trace_emit(rctx);
 652 }
 653
 654 static void compute_setup_cbs(struct r600_context *rctx)
 655 {
 656         struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 657         unsigned i;
 658
 659         /* Emit colorbuffers. */
 660         /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
 661         for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
 662                 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
 663                 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 664                                                        (struct r600_resource*)cb->base.texture,
 665                                                        RADEON_USAGE_READWRITE,
 666                                                        RADEON_PRIO_SHADER_RW_BUFFER);
 667
 668                 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
 669                 radeon_emit(cs, cb->cb_color_base);     /* R_028C60_CB_COLOR0_BASE */
 670                 radeon_emit(cs, cb->cb_color_pitch);    /* R_028C64_CB_COLOR0_PITCH */
 671                 radeon_emit(cs, cb->cb_color_slice);    /* R_028C68_CB_COLOR0_SLICE */
 672                 radeon_emit(cs, cb->cb_color_view);     /* R_028C6C_CB_COLOR0_VIEW */
 673                 radeon_emit(cs, cb->cb_color_info);     /* R_028C70_CB_COLOR0_INFO */
 674                 radeon_emit(cs, cb->cb_color_attrib);   /* R_028C74_CB_COLOR0_ATTRIB */
 675                 radeon_emit(cs, cb->cb_color_dim);              /* R_028C78_CB_COLOR0_DIM */
 676
 677                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
 678                 radeon_emit(cs, reloc);
 679
 680                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
 681                 radeon_emit(cs, reloc);
 682         }
 683         for (; i < 8 ; i++)
 684                 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
 685                                                S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 686         for (; i < 12; i++)
 687                 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
 688                                                S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 689
 690         /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
 691         radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
 692                                        rctx->compute_cb_target_mask);
 693 }
 694
 695 static void compute_emit_cs(struct r600_context *rctx,
 696                             const struct pipe_grid_info *info)
 697 {
 698         struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 699         bool compute_dirty = false;
 700         struct r600_pipe_shader *current;
 701         struct r600_shader_atomic combined_atomics[8];
 702         uint8_t atomic_used_mask;
 703         uint32_t indirect_grid[3] = { 0, 0, 0 };
 704
 705         /* make sure that the gfx ring is only one active */
 706         if (radeon_emitted(rctx->b.dma.cs, 0)) {
 707                 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
 708         }
 709
 710         r600_update_compressed_resource_state(rctx, true);
 711
 712         if (!rctx->cmd_buf_is_compute) {
 713                 rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
 714                 rctx->cmd_buf_is_compute = true;
 715         }
 716
 717         r600_need_cs_space(rctx, 0, true);
 718         if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
 719                 r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
 720                 current = rctx->cs_shader_state.shader->sel->current;
 721                 if (compute_dirty) {
 722                         rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
 723                         r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
 724                         r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
 725                 }
 726
 727                 bool need_buf_const = current->shader.uses_tex_buffers ||
 728                         current->shader.has_txq_cube_array_z_comp;
 729
 730                 if (info->indirect) {
 731                         struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
 732                         unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
 733                         unsigned offset = info->indirect_offset / 4;
 734                         indirect_grid[0] = data[offset];
 735                         indirect_grid[1] = data[offset + 1];
 736                         indirect_grid[2] = data[offset + 2];
 737                 }
 738                 for (int i = 0; i < 3; i++) {
 739                         rctx->cs_block_grid_sizes[i] = info->block[i];
 740                         rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
 741                 }
 742                 rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
 743                 rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
 744                 if (need_buf_const) {
 745                         eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
 746                 }
 747                 r600_update_driver_const_buffers(rctx, true);
 748
 749                 if (evergreen_emit_atomic_buffer_setup(rctx, current, combined_atomics, &atomic_used_mask)) {
 750                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 751                         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 752                 }
 753         }
 754
 755         /* Initialize all the compute-related registers.
 756          *
 757          * See evergreen_init_atom_start_compute_cs() in this file for the list
 758          * of registers initialized by the start_compute_cs_cmd atom.
 759          */
 760         r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
 761
 762         /* emit config state */
 763         if (rctx->b.chip_class == EVERGREEN) {
 764                 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
 765                         radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
 766                         radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
 767                         radeon_emit(cs, 0);
 768                         radeon_emit(cs, 0);
 769                         radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
 770                 } else
 771                         r600_emit_atom(rctx, &rctx->config_state.atom);
 772         }
 773
 774         rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
 775         r600_flush_emit(rctx);
 776
 777         if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {
 778
 779                 compute_setup_cbs(rctx);
 780
 781                 /* Emit vertex buffer state */
 782                 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
 783                 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
 784         } else {
 785                 uint32_t rat_mask;
 786
 787                 rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
 788                 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
 789                                                rat_mask);
 790         }
 791
 792         /* Emit constant buffer state */
 793         r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
 794
 795         /* Emit sampler state */
 796         r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
 797
 798         /* Emit sampler view (texture resource) state */
 799         r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
 800
 801         /* Emit images state */
 802         r600_emit_atom(rctx, &rctx->compute_images.atom);
 803
 804         /* Emit buffers state */
 805         r600_emit_atom(rctx, &rctx->compute_buffers.atom);
 806
 807         /* Emit shader state */
 808         r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
 809
 810         /* Emit dispatch state and dispatch packet */
 811         evergreen_emit_dispatch(rctx, info, indirect_grid);
 812
 813         /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
 814          */
 815         rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
 816                       R600_CONTEXT_INV_VERTEX_CACHE |
 817                       R600_CONTEXT_INV_TEX_CACHE;
 818         r600_flush_emit(rctx);
 819         rctx->b.flags = 0;
 820
 821         if (rctx->b.chip_class >= CAYMAN) {
 822                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 823                 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 824                 /* DEALLOC_STATE prevents the GPU from hanging when a
 825                  * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
 826                  * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
 827                  */
 828                 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
 829                 radeon_emit(cs, 0);
 830         }
 831         if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
 832                 evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
 833
 834 #if 0
 835         COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
 836         for (i = 0; i < cs->cdw; i++) {
 837                 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
 838         }
 839 #endif
 840
 841 }
 842
 843
 844 /**
 845  * Emit function for r600_cs_shader_state atom
 846  */
 847 void evergreen_emit_cs_shader(struct r600_context *rctx,
 848                               struct r600_atom *atom)
 849 {
 850         struct r600_cs_shader_state *state =
 851                                         (struct r600_cs_shader_state*)atom;
 852         struct r600_pipe_compute *shader = state->shader;
 853         struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 854         uint64_t va;
 855         struct r600_resource *code_bo;
 856         unsigned ngpr, nstack;
 857
 858         if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
 859                 code_bo = shader->sel->current->bo;
 860                 va = shader->sel->current->bo->gpu_address;
 861                 ngpr = shader->sel->current->shader.bc.ngpr;
 862                 nstack = shader->sel->current->shader.bc.nstack;
 863         } else {
 864                 code_bo = shader->code_bo;
 865                 va = shader->code_bo->gpu_address + state->pc;
 866                 ngpr = shader->bc.ngpr;
 867                 nstack = shader->bc.nstack;
 868         }
 869
 870         radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
 871         radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
 872         radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
 873                         S_0288D4_NUM_GPRS(ngpr) |
 874                         S_0288D4_DX10_CLAMP(1) |
 875                         S_0288D4_STACK_SIZE(nstack));
 876         radeon_emit(cs, 0);     /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
 877
 878         radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
 879         radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 880                                               code_bo, RADEON_USAGE_READ,
 881                                               RADEON_PRIO_SHADER_BINARY));
 882 }
 883
 884 static void evergreen_launch_grid(struct pipe_context *ctx,
 885                                   const struct pipe_grid_info *info)
 886 {
 887         struct r600_context *rctx = (struct r600_context *)ctx;
 888 #ifdef HAVE_OPENCL
 889         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 890         boolean use_kill;
 891
 892         if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
 893                 rctx->cs_shader_state.pc = info->pc;
 894                 /* Get the config information for this kernel. */
 895                 r600_shader_binary_read_config(&shader->binary, &shader->bc,
 896                                                info->pc, &use_kill);
 897         } else {
 898                 use_kill = false;
 899                 rctx->cs_shader_state.pc = 0;
 900         }
 901 #endif
 902
 903         COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
 904
 905
 906         evergreen_compute_upload_input(ctx, info);
 907         compute_emit_cs(rctx, info);
 908 }
 909
 910 static void evergreen_set_compute_resources(struct pipe_context *ctx,
 911                                             unsigned start, unsigned count,
 912                                             struct pipe_surface **surfaces)
 913 {
 914         struct r600_context *rctx = (struct r600_context *)ctx;
 915         struct r600_surface **resources = (struct r600_surface **)surfaces;
 916
 917         COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
 918                         start, count);
 919
 920         for (unsigned i = 0; i < count; i++) {
 921                 /* The First four vertex buffers are reserved for parameters and
 922                  * global buffers. */
 923                 unsigned vtx_id = 4 + i;
 924                 if (resources[i]) {
 925                         struct r600_resource_global *buffer =
 926                                 (struct r600_resource_global*)
 927                                 resources[i]->base.texture;
 928                         if (resources[i]->base.writable) {
 929                                 assert(i+1 < 12);
 930
 931                                 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
 932                                 (struct r600_resource *)resources[i]->base.texture,
 933                                 buffer->chunk->start_in_dw*4,
 934                                 resources[i]->base.texture->width0);
 935                         }
 936
 937                         evergreen_cs_set_vertex_buffer(rctx, vtx_id,
 938                                         buffer->chunk->start_in_dw * 4,
 939                                         resources[i]->base.texture);
 940                 }
 941         }
 942 }
 943
 944 static void evergreen_set_global_binding(struct pipe_context *ctx,
 945                                          unsigned first, unsigned n,
 946                                          struct pipe_resource **resources,
 947                                          uint32_t **handles)
 948 {
 949         struct r600_context *rctx = (struct r600_context *)ctx;
 950         struct compute_memory_pool *pool = rctx->screen->global_pool;
 951         struct r600_resource_global **buffers =
 952                 (struct r600_resource_global **)resources;
 953         unsigned i;
 954
 955         COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
 956                         first, n);
 957
 958         if (!resources) {
 959                 /* XXX: Unset */
 960                 return;
 961         }
 962
 963         /* We mark these items for promotion to the pool if they
 964          * aren't already there */
 965         for (i = first; i < first + n; i++) {
 966                 struct compute_memory_item *item = buffers[i]->chunk;
 967
 968                 if (!is_item_in_pool(item))
 969                         buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
 970         }
 971
 972         if (compute_memory_finalize_pending(pool, ctx) == -1) {
 973                 /* XXX: Unset */
 974                 return;
 975         }
 976
 977         for (i = first; i < first + n; i++)
 978         {
 979                 uint32_t buffer_offset;
 980                 uint32_t handle;
 981                 assert(resources[i]->target == PIPE_BUFFER);
 982                 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
 983
 984                 buffer_offset = util_le32_to_cpu(*(handles[i]));
 985                 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
 986
 987                 *(handles[i]) = util_cpu_to_le32(handle);
 988         }
 989
 990         /* globals for writing */
 991         evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
 992         /* globals for reading */
 993         evergreen_cs_set_vertex_buffer(rctx, 1, 0,
 994                                 (struct pipe_resource*)pool->bo);
 995
 996         /* constants for reading, LLVM puts them in text segment */
 997         evergreen_cs_set_vertex_buffer(rctx, 2, 0,
 998                                 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
 999 }
1000
1001 /**
1002  * This function initializes all the compute specific registers that need to
1003  * be initialized for each compute command stream.  Registers that are common
1004  * to both compute and 3D will be initialized at the beginning of each compute
1005  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
1006  * packet requires that the shader type bit be set, we must initialize all
1007  * context registers needed for compute in this function.  The registers
1008  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1009  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1010  * on the GPU family.
1011  */
1012 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1013 {
1014         struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1015         int num_threads;
1016         int num_stack_entries;
1017
1018         /* since all required registers are initialized in the
1019          * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1020          */
1021         r600_init_command_buffer(cb, 256);
1022         cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1023
1024         /* We're setting config registers here. */
1025         r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1026         r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1027
1028         switch (rctx->b.family) {
1029         case CHIP_CEDAR:
1030         default:
1031                 num_threads = 128;
1032                 num_stack_entries = 256;
1033                 break;
1034         case CHIP_REDWOOD:
1035                 num_threads = 128;
1036                 num_stack_entries = 256;
1037                 break;
1038         case CHIP_JUNIPER:
1039                 num_threads = 128;
1040                 num_stack_entries = 512;
1041                 break;
1042         case CHIP_CYPRESS:
1043         case CHIP_HEMLOCK:
1044                 num_threads = 128;
1045                 num_stack_entries = 512;
1046                 break;
1047         case CHIP_PALM:
1048                 num_threads = 128;
1049                 num_stack_entries = 256;
1050                 break;
1051         case CHIP_SUMO:
1052                 num_threads = 128;
1053                 num_stack_entries = 256;
1054                 break;
1055         case CHIP_SUMO2:
1056                 num_threads = 128;
1057                 num_stack_entries = 512;
1058                 break;
1059         case CHIP_BARTS:
1060                 num_threads = 128;
1061                 num_stack_entries = 512;
1062                 break;
1063         case CHIP_TURKS:
1064                 num_threads = 128;
1065                 num_stack_entries = 256;
1066                 break;
1067         case CHIP_CAICOS:
1068                 num_threads = 128;
1069                 num_stack_entries = 256;
1070                 break;
1071         }
1072
1073         /* The primitive type always needs to be POINTLIST for compute. */
1074         r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1075                                                 V_008958_DI_PT_POINTLIST);
1076
1077         if (rctx->b.chip_class < CAYMAN) {
1078
1079                 /* These registers control which simds can be used by each stage.
1080                  * The default for these registers is 0xffffffff, which means
1081                  * all simds are available for each stage.  It's possible we may
1082                  * want to play around with these in the future, but for now
1083                  * the default value is fine.
1084                  *
1085                  * R_008E20_SQ_STATIC_THREAD_MGMT1
1086                  * R_008E24_SQ_STATIC_THREAD_MGMT2
1087                  * R_008E28_SQ_STATIC_THREAD_MGMT3
1088                  */
1089
1090                 /* XXX: We may need to adjust the thread and stack resource
1091                  * values for 3D/compute interop */
1092
1093                 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1094
1095                 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1096                  * Set the number of threads used by the PS/VS/GS/ES stage to
1097                  * 0.
1098                  */
1099                 r600_store_value(cb, 0);
1100
1101                 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1102                  * Set the number of threads used by the CS (aka LS) stage to
1103                  * the maximum number of threads and set the number of threads
1104                  * for the HS stage to 0. */
1105                 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1106
1107                 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1108                  * Set the Control Flow stack entries to 0 for PS/VS stages */
1109                 r600_store_value(cb, 0);
1110
1111                 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1112                  * Set the Control Flow stack entries to 0 for GS/ES stages */
1113                 r600_store_value(cb, 0);
1114
1115                 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1116                  * Set the Contol Flow stack entries to 0 for the HS stage, and
1117                  * set it to the maximum value for the CS (aka LS) stage. */
1118                 r600_store_value(cb,
1119                         S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1120         }
1121         /* Give the compute shader all the available LDS space.
1122          * NOTE: This only sets the maximum number of dwords that a compute
1123          * shader can allocate.  When a shader is executed, we still need to
1124          * allocate the appropriate amount of LDS dwords using the
1125          * CM_R_0288E8_SQ_LDS_ALLOC register.
1126          */
1127         if (rctx->b.chip_class < CAYMAN) {
1128                 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1129                         S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1130         } else {
1131                 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1132                         S_0286FC_NUM_PS_LDS(0) |
1133                         S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1134         }
1135
1136         /* Context Registers */
1137
1138         if (rctx->b.chip_class < CAYMAN) {
1139                 /* workaround for hw issues with dyn gpr - must set all limits
1140                  * to 240 instead of 0, 0x1e == 240 / 8
1141                  */
1142                 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1143                                 S_028838_PS_GPRS(0x1e) |
1144                                 S_028838_VS_GPRS(0x1e) |
1145                                 S_028838_GS_GPRS(0x1e) |
1146                                 S_028838_ES_GPRS(0x1e) |
1147                                 S_028838_HS_GPRS(0x1e) |
1148                                 S_028838_LS_GPRS(0x1e));
1149         }
1150
1151         /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1152         r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1153                 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1154
1155         r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1156
1157         r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1158                                S_0286E8_TID_IN_GROUP_ENA(1) |
1159                                S_0286E8_TGID_ENA(1) |
1160                                S_0286E8_DISABLE_INDEX_PACK(1));
1161
1162         /* The LOOP_CONST registers are an optimizations for loops that allows
1163          * you to store the initial counter, increment value, and maximum
1164          * counter value in a register so that hardware can calculate the
1165          * correct number of iterations for the loop, so that you don't need
1166          * to have the loop counter in your shader code.  We don't currently use
1167          * this optimization, so we must keep track of the counter in the
1168          * shader and use a break instruction to exit loops.  However, the
1169          * hardware will still uses this register to determine when to exit a
1170          * loop, so we need to initialize the counter to 0, set the increment
1171          * value to 1 and the maximum counter value to the 4095 (0xfff) which
1172          * is the maximum value allowed.  This gives us a maximum of 4096
1173          * iterations for our loops, but hopefully our break instruction will
1174          * execute before some time before the 4096th iteration.
1175          */
1176         eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1177 }
1178
1179 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1180 {
1181         rctx->b.b.create_compute_state = evergreen_create_compute_state;
1182         rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1183         rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1184 //       rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1185         rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1186         rctx->b.b.set_global_binding = evergreen_set_global_binding;
1187         rctx->b.b.launch_grid = evergreen_launch_grid;
1188
1189 }
1190
1191 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1192                                               struct pipe_resource *resource,
1193                                               unsigned level,
1194                                               unsigned usage,
1195                                               const struct pipe_box *box,
1196                                               struct pipe_transfer **ptransfer)
1197 {
1198         struct r600_context *rctx = (struct r600_context*)ctx;
1199         struct compute_memory_pool *pool = rctx->screen->global_pool;
1200         struct r600_resource_global* buffer =
1201                 (struct r600_resource_global*)resource;
1202
1203         struct compute_memory_item *item = buffer->chunk;
1204         struct pipe_resource *dst = NULL;
1205         unsigned offset = box->x;
1206
1207         if (is_item_in_pool(item)) {
1208                 compute_memory_demote_item(pool, item, ctx);
1209         }
1210         else {
1211                 if (item->real_buffer == NULL) {
1212                         item->real_buffer =
1213                                         r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1214                 }
1215         }
1216
1217         dst = (struct pipe_resource*)item->real_buffer;
1218
1219         if (usage & PIPE_TRANSFER_READ)
1220                 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1221
1222         COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1223                         "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1224                         "width = %u, height = %u, depth = %u)\n", level, usage,
1225                         box->x, box->y, box->z, box->width, box->height,
1226                         box->depth);
1227         COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1228                 "%u (box.x)\n", item->id, box->x);
1229
1230
1231         assert(resource->target == PIPE_BUFFER);
1232         assert(resource->bind & PIPE_BIND_GLOBAL);
1233         assert(box->x >= 0);
1234         assert(box->y == 0);
1235         assert(box->z == 0);
1236
1237         ///TODO: do it better, mapping is not possible if the pool is too big
1238         return pipe_buffer_map_range(ctx, dst,
1239                         offset, box->width, usage, ptransfer);
1240 }
1241
/**
 * transfer_unmap slot of the global buffer vtable; not expected to run.
 *
 * A struct r600_resource_global is not a real resource.
 * r600_compute_global_transfer_map() in this file maps the item's backing
 * buffer through pipe_buffer_map_range() rather than the resource it was
 * given, so the resulting transfer is expected to be unmapped through that
 * buffer's own vtable and never through this function.
 * NOTE(review): the original note named r600_buffer_transfer_map() as the
 * function that gets called instead; presumably the unmap counterpart was
 * meant - confirm.
 */
static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					       struct pipe_transfer *transfer)
{
	(void)ctx;
	(void)transfer;

	assert (!"This function should not be called");
}
1257
/**
 * transfer_flush_region slot of the global buffer vtable.
 *
 * Not implemented: the body only asserts, so the call does nothing when
 * assertions are compiled out.
 */
static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
						      struct pipe_transfer *transfer,
						      const struct pipe_box *box)
{
	(void)ctx;
	(void)transfer;
	(void)box;

	assert(0 && "TODO");
}
1264
1265 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1266                                                struct pipe_resource *res)
1267 {
1268         struct r600_resource_global* buffer = NULL;
1269         struct r600_screen* rscreen = NULL;
1270
1271         assert(res->target == PIPE_BUFFER);
1272         assert(res->bind & PIPE_BIND_GLOBAL);
1273
1274         buffer = (struct r600_resource_global*)res;
1275         rscreen = (struct r600_screen*)screen;
1276
1277         compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1278
1279         buffer->chunk = NULL;
1280         free(res);
1281 }
1282
1283 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1284 {
1285         u_default_resource_get_handle, /* get_handle */
1286         r600_compute_global_buffer_destroy, /* resource_destroy */
1287         r600_compute_global_transfer_map, /* transfer_map */
1288         r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1289         r600_compute_global_transfer_unmap, /* transfer_unmap */
1290 };
1291
1292 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1293                                                         const struct pipe_resource *templ)
1294 {
1295         struct r600_resource_global* result = NULL;
1296         struct r600_screen* rscreen = NULL;
1297         int size_in_dw = 0;
1298
1299         assert(templ->target == PIPE_BUFFER);
1300         assert(templ->bind & PIPE_BIND_GLOBAL);
1301         assert(templ->array_size == 1 || templ->array_size == 0);
1302         assert(templ->depth0 == 1 || templ->depth0 == 0);
1303         assert(templ->height0 == 1 || templ->height0 == 0);
1304
1305         result = (struct r600_resource_global*)
1306         CALLOC(sizeof(struct r600_resource_global), 1);
1307         rscreen = (struct r600_screen*)screen;
1308
1309         COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1310         COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1311                         templ->array_size);
1312
1313         result->base.b.vtbl = &r600_global_buffer_vtbl;
1314         result->base.b.b = *templ;
1315         result->base.b.b.screen = screen;
1316         pipe_reference_init(&result->base.b.b.reference, 1);
1317
1318         size_in_dw = (templ->width0+3) / 4;
1319
1320         result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1321
1322         if (result->chunk == NULL)
1323         {
1324                 free(result);
1325                 return NULL;
1326         }
1327
1328         return &result->base.b.b;
1329 }