/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#ifdef HAVE_OPENCL
#include <gelf.h>
#include <libelf.h>
#endif
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "tgsi/tgsi_parse.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include <inttypes.h>

/**
 * RAT0 is for global binding write
 * VTX1 is for global binding read
 *
 * for writing images, RAT1...
 * for reading images, TEX2...
 *   TEX2... consumes the same fetch resources that VTX2... would consume
 *
 * CONST0 and VTX0 are for parameters
 *   CONST0 binds the smaller input parameter buffer, and is used for
 *   constant indexing; it is also constant cached
 *   VTX0 is for indirect/non-constant indexing, or if the input is bigger
 *   than the constant cache can handle
 *
 * RATs are limited to 12, so we can bind at most 11 textures for writing,
 * because we reserve RAT0 for global bindings. With byte addressing
 * enabled, we should reserve another one too => at most 10 image bindings
 * for writing.
 *
 * from NVIDIA OpenCL:
 *   CL_DEVICE_MAX_READ_IMAGE_ARGS:  128
 *   CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
 *
 * so 10 for writing is enough. 176 is the max for reading according to
 * the docs.
 *
 * writable images should be listed first (< 10), so their id corresponds
 * to RAT(id+1)
 * writable images will consume TEX slots, and VTX slots too, because of
 * linear indexing
 */
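
/*
 * For reference, the fixed slot assignments that follow from the scheme
 * above, as used later in this file:
 *   CONST0 - kernel parameters (constant path)
 *   VTX0   - kernel parameters (reserved; LLVM prefers ID=0 but it does
 *            not work for dynamic indices)
 *   VTX1   - global memory pool, for reads
 *   VTX2   - text-segment constants emitted by LLVM
 *   VTX3   - kernel parameter buffer (see evergreen_compute_upload_input())
 *   VTX4+  - resources bound via set_compute_resources()
 *   RAT0   - global memory pool, for writes
 *   RAT1+  - writable images
 */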

struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen *)screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}

static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop. cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface *)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}
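
/*
 * Note: binding the RAT as a color buffer is what routes compute writes
 * through the CB block. The surface is created with an R32_UINT format,
 * which is presumably why evergreen_set_rat() requires the size to be a
 * multiple of four bytes.
 */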

static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer.resource = buffer;
	vb->is_user_buffer = false;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * evergreend.h and r600d.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS         0x028868
#define R_028850_SQ_PGM_RESOURCES_PS         0x028850

#ifdef HAVE_OPENCL
static void parse_symbol_table(Elf_Data *symbol_table_data,
			       const GElf_Shdr *symbol_table_header,
			       struct ac_shader_binary *binary)
{
	GElf_Sym symbol;
	unsigned i = 0;
	unsigned symbol_count =
		symbol_table_header->sh_size / symbol_table_header->sh_entsize;

	/* We are over allocating this list, because symbol_count gives the
	 * total number of symbols, and we will only be filling the list
	 * with offsets of global symbols. The memory savings from
	 * allocating the correct size of this list will be small, and
	 * I don't think it is worth the cost of pre-computing the number
	 * of global symbols.
	 */
	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));

	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
		unsigned i;
		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
		    symbol.st_shndx == 0 /* Undefined symbol */) {
			continue;
		}

		binary->global_symbol_offsets[binary->global_symbol_count] =
			symbol.st_value;

		/* Sort the list using bubble sort. This list will usually
		 * be small. */
		for (i = binary->global_symbol_count; i > 0; --i) {
			uint64_t lhs = binary->global_symbol_offsets[i - 1];
			uint64_t rhs = binary->global_symbol_offsets[i];
			if (lhs < rhs) {
				break;
			}
			binary->global_symbol_offsets[i] = lhs;
			binary->global_symbol_offsets[i - 1] = rhs;
		}
		++binary->global_symbol_count;
	}
}
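
/*
 * The offsets list is kept sorted, presumably so that a symbol's position
 * in global_symbol_offsets matches the position of its config block in
 * .AMDGPU.config; r600_shader_binary_config_start() below relies on that
 * index to locate the per-kernel config data.
 */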

static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
			 unsigned symbol_sh_link,
			 struct ac_shader_binary *binary)
{
	unsigned i;

	if (!relocs || !symbols || !binary->reloc_count) {
		return;
	}
	binary->relocs = CALLOC(binary->reloc_count,
				sizeof(struct ac_shader_reloc));
	for (i = 0; i < binary->reloc_count; i++) {
		GElf_Sym symbol;
		GElf_Rel rel;
		char *symbol_name;
		struct ac_shader_reloc *reloc = &binary->relocs[i];

		gelf_getrel(relocs, i, &rel);
		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);

		reloc->offset = rel.r_offset;
		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
		reloc->name[sizeof(reloc->name)-1] = 0;
	}
}

static void r600_elf_read(const char *elf_data, unsigned elf_size,
			  struct ac_shader_binary *binary)
{
	char *elf_buffer;
	Elf *elf;
	Elf_Scn *section = NULL;
	Elf_Data *symbols = NULL, *relocs = NULL;
	size_t section_str_index;
	unsigned symbol_sh_link = 0;

	/* One of the libelf implementations
	 * (http://www.mr511.de/software/english.htm) requires calling
	 * elf_version() before elf_memory().
	 */
	elf_version(EV_CURRENT);
	elf_buffer = MALLOC(elf_size);
	memcpy(elf_buffer, elf_data, elf_size);

	elf = elf_memory(elf_buffer, elf_size);

	elf_getshdrstrndx(elf, &section_str_index);

	while ((section = elf_nextscn(elf, section))) {
		const char *name;
		Elf_Data *section_data = NULL;
		GElf_Shdr section_header;
		if (gelf_getshdr(section, &section_header) != &section_header) {
			fprintf(stderr, "Failed to read ELF section header\n");
			return;
		}
		name = elf_strptr(elf, section_str_index, section_header.sh_name);
		if (!strcmp(name, ".text")) {
			section_data = elf_getdata(section, section_data);
			binary->code_size = section_data->d_size;
			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
			memcpy(binary->code, section_data->d_buf, binary->code_size);
		} else if (!strcmp(name, ".AMDGPU.config")) {
			section_data = elf_getdata(section, section_data);
			binary->config_size = section_data->d_size;
			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
			memcpy(binary->config, section_data->d_buf, binary->config_size);
		} else if (!strcmp(name, ".AMDGPU.disasm")) {
			/* Always read disassembly if it's available. */
			section_data = elf_getdata(section, section_data);
			binary->disasm_string = strndup(section_data->d_buf,
							section_data->d_size);
		} else if (!strncmp(name, ".rodata", 7)) {
			section_data = elf_getdata(section, section_data);
			binary->rodata_size = section_data->d_size;
			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
		} else if (!strncmp(name, ".symtab", 7)) {
			symbols = elf_getdata(section, section_data);
			symbol_sh_link = section_header.sh_link;
			parse_symbol_table(symbols, &section_header, binary);
		} else if (!strcmp(name, ".rel.text")) {
			relocs = elf_getdata(section, section_data);
			binary->reloc_count = section_header.sh_size /
					      section_header.sh_entsize;
		}
	}

	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);

	if (elf) {
		elf_end(elf);
	}
	FREE(elf_buffer);

	/* Cache the config size per symbol */
	if (binary->global_symbol_count) {
		binary->config_size_per_symbol =
			binary->config_size / binary->global_symbol_count;
	} else {
		binary->global_symbol_count = 1;
		binary->config_size_per_symbol = binary->config_size;
	}
}

static const unsigned char *r600_shader_binary_config_start(
	const struct ac_shader_binary *binary,
	uint64_t symbol_offset)
{
	unsigned i;
	for (i = 0; i < binary->global_symbol_count; ++i) {
		if (binary->global_symbol_offsets[i] == symbol_offset) {
			unsigned offset = i * binary->config_size_per_symbol;
			return binary->config + offset;
		}
	}
	return binary->config;
}
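
/*
 * The .AMDGPU.config payload is interpreted as (register, value) dword
 * pairs, 8 bytes per entry, with one group of config_size_per_symbol bytes
 * per kernel symbol; the loop below decodes the pairs for a single kernel.
 */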
static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		r600_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct ac_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	return shader;
}

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
		r600_delete_shader_selector(ctx, shader->sel);
	} else {
#ifdef HAVE_OPENCL
		radeon_shader_binary_clean(&shader->binary);
#endif
		r600_destroy_shader(&shader->bc);

		/* TODO destroy shader->code_bo, shader->const_bo
		 * we'll need something like r600_buffer_free */
	}
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	if (!state) {
		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
		return;
	}

	if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
		bool compute_dirty;

		r600_shader_select(ctx, cstate->sel, &compute_dirty);
	}

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0), and besides the
 * explicit kernel parameters there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
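
/* As a worked example (hypothetical numbers, not taken from the code
 * below): a 2D dispatch of 16x16 work groups of 8x8 work items would store
 *   DWORDS 0-2: 16, 16, 1
 *   DWORDS 3-5: 128, 128, 1  (16*8 global work items per dimension)
 *   DWORDS 6-8: 8, 8, 1
 * and the kernel's own arguments from DWORD 9 on. */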
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI)
		return;

	if (shader->input_size == 0) {
		return;
	}
	input_size = shader->input_size + 36;
	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					   PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->transfer_map(ctx,
			(struct pipe_resource *)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			    ((unsigned*)num_work_groups_start)[i]);
	}

	ctx->transfer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM will preferably use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource *)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource *)shader->kernel_param);
}

static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
		     wave_divisor - 1) / wave_divisor;
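
	/* For example (hypothetical numbers): an 8x8x8 thread block on a
	 * 4-pipe chip gives 512 work items / (16 * 4) = 8 wavefronts. */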

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				  "%u wavefronts per thread block, "
				  "allocating %u dwords lds.\n",
				  num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
			      group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
				       lds_size | (num_waves << 14));

	if (info->indirect) {
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}
}
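
/*
 * Note that even the indirect path above emits PKT3_DISPATCH_DIRECT: the
 * indirect grid values are read back on the CPU (see compute_emit_cs()
 * later in this file) and passed in as immediates, rather than using an
 * indirect dispatch packet.
 */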

static void compute_setup_cbs(struct r600_context *rctx)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	unsigned i;

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface *)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
							   (struct r600_resource *)cb->base.texture,
							   RADEON_USAGE_READWRITE,
							   RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	/* Mark the remaining colorbuffers as invalid */
	for (; i < 8; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
				       rctx->compute_cb_target_mask);
}

static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	bool compute_dirty = false;
	struct r600_pipe_shader *current;
	struct r600_shader_atomic combined_atomics[8];
	uint8_t atomic_used_mask;
	uint32_t indirect_grid[3] = { 0, 0, 0 };

	/* make sure the gfx ring is the only active one */
	if (radeon_emitted(rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
	}

	r600_update_compressed_resource_state(rctx, true);

	if (!rctx->cmd_buf_is_compute) {
		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
		rctx->cmd_buf_is_compute = true;
	}

	r600_need_cs_space(rctx, 0, true);
	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
		r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
		current = rctx->cs_shader_state.shader->sel->current;
		if (compute_dirty) {
			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
		}

		bool need_buf_const = current->shader.uses_tex_buffers ||
				      current->shader.has_txq_cube_array_z_comp;

		if (info->indirect) {
			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
			unsigned offset = info->indirect_offset / 4;
			indirect_grid[0] = data[offset];
			indirect_grid[1] = data[offset + 1];
			indirect_grid[2] = data[offset + 2];
		}
		for (int i = 0; i < 3; i++) {
			rctx->cs_block_grid_sizes[i] = info->block[i];
			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
		}
		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
		if (need_buf_const) {
			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
		}
		r600_update_driver_const_buffers(rctx, true);

		if (evergreen_emit_atomic_buffer_setup(rctx, current, combined_atomics, &atomic_used_mask)) {
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		}
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN) {
		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
		} else
			r600_emit_atom(rctx, &rctx->config_state.atom);
	}

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {

		compute_setup_cbs(rctx);

		/* Emit vertex buffer state */
		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
	} else {
		uint32_t rat_mask;

		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					       rat_mask);
	}

	r600_emit_atom(rctx, &rctx->b.render_cond_atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit images state */
	r600_emit_atom(rctx, &rctx->compute_images.atom);

	/* Emit buffers state */
	r600_emit_atom(rctx, &rctx->compute_buffers.atom);

	/* Emit shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info, indirect_grid);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}
	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}

/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
		(struct r600_cs_shader_state *)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
		code_bo = shader->sel->current->bo;
		va = shader->sel->current->bo->gpu_address;
		ngpr = shader->sel->current->shader.bc.ngpr;
		nstack = shader->sel->current->shader.bc.nstack;
	} else {
		code_bo = shader->code_bo;
		va = shader->code_bo->gpu_address + state->pc;
		ngpr = shader->bc.ngpr;
		nstack = shader->bc.nstack;
	}

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
		    S_0288D4_NUM_GPRS(ngpr) |
		    S_0288D4_DX10_CLAMP(1) |
		    S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						  code_bo, RADEON_USAGE_READ,
						  RADEON_PRIO_SHADER_BINARY));
}
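
/*
 * Note: the compute shader runs on the LS hardware stage here, which is
 * why the shader address and resources are programmed through the
 * SQ_PGM_*_LS registers above, and why comments elsewhere in this file
 * refer to the "CS (aka LS) stage".
 */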

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
		rctx->cs_shader_state.pc = info->pc;
		/* Get the config information for this kernel. */
		r600_shader_binary_read_config(&shader->binary, &shader->bc,
					       info->pc, &use_kill);
	} else {
		rctx->cs_shader_state.pc = 0;
	}
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
		    start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 4 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global *)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i + 1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i + 1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
		    first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++)
	{
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
			(struct pipe_resource *)pool->bo);

	/* constants for reading, LLVM puts them in text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
			(struct pipe_resource *)rctx->cs_shader_state.shader->code_bo);
}
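
/*
 * About the handle patching above: *handles[i] initially holds an offset
 * relative to the start of buffer i. Because every global buffer lives in
 * the shared compute memory pool, the buffer's byte offset within the pool
 * (chunk->start_in_dw * 4) is added, so the kernel receives an absolute
 * offset into the pool. The values stay little-endian, hence the
 * util_le32_to_cpu()/util_cpu_to_le32() pair.
 */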

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
			      V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0. */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
				 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate. When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
				      S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
				       S_0286FC_NUM_PS_LDS(0) |
				       S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				       S_028838_PS_GPRS(0x1e) |
				       S_028838_VS_GPRS(0x1e) |
				       S_028838_GS_GPRS(0x1e) |
				       S_028838_ES_GPRS(0x1e) |
				       S_028838_HS_GPRS(0x1e) |
				       S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
			       S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA(1) |
			       S_0286E8_TGID_ENA(1) |
			       S_0286E8_DISABLE_INDEX_PACK(1));

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code. We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
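
	/* Decoding the constant below, going by the description above (not
	 * verified against separate register documentation): the low 12 bits
	 * hold the maximum counter value (0xFFF = 4095), the middle bits the
	 * initial value (0), and the high bits the increment (1), giving
	 * 0x1000FFF. */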
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
					      struct pipe_resource *resource,
					      unsigned level,
					      unsigned usage,
					      const struct pipe_box *box,
					      struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global *buffer =
		(struct r600_resource_global *)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource *)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
			"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big
	return pipe_buffer_map_range(ctx, dst,
				     offset, box->width, usage, ptransfer);
}

static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					       struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool. The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}

static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
						      struct pipe_transfer *transfer,
						      const struct pipe_box *box)
{
	assert(0 && "TODO");
}

static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					       struct pipe_resource *res)
{
	struct r600_resource_global *buffer = NULL;
	struct r600_screen *rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global *)res;
	rscreen = (struct r600_screen *)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
};

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							const struct pipe_resource *templ)
{
	struct r600_resource_global *result = NULL;
	struct r600_screen *rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global *)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen *)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
		    templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}