[mesa.git] src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "pipebuffer/pb_buffer.h"
46 #include "evergreend.h"
47 #include "r600_shader.h"
48 #include "r600_pipe.h"
49 #include "r600_formats.h"
50 #include "evergreen_compute.h"
51 #include "evergreen_compute_internal.h"
52 #include "compute_memory_pool.h"
53 #include "sb/sb_public.h"
54 #include <inttypes.h>
55
56 /**
57 RAT0 is for global binding write
58 VTX1 is for global binding read
59
60 for writing images: RAT1...
61 for reading images: TEX2...
62 TEX2 and RAT1 are paired
63
64 TEX2... consumes the same fetch resources that VTX2... would consume
65
66 CONST0 and VTX0 are for parameters
67 CONST0 binds the smaller input parameter buffer, is used for constant indexing,
68 and is also constant cached
69 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
70 the constant cache can handle
71
72 RATs are limited to 12, so we can bind at most 11 textures for writing,
73 because we reserve RAT0 for global bindings. With byte addressing enabled,
74 we should reserve another one too => 10 image bindings for writing max.
75
76 from NVIDIA OpenCL:
77 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
78 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
79
80 so 10 for writing is enough; 176 is the max for reading according to the docs
81
82 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
83 writable images will consume TEX slots, and VTX slots too, because of linear indexing
84
85 */
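/* For illustration only: a rough map of the fixed binding points described
 * above, using hypothetical names that are not used elsewhere in the driver.
 */
#if 0
enum evergreen_compute_fixed_bindings {
	/* RATs (writing) */
	RAT_GLOBAL_WRITE   = 0,  /* global memory pool, see evergreen_set_global_binding() */
	RAT_FIRST_IMAGE    = 1,  /* writable image "id" is bound as RAT(id + 1) */

	/* VTX fetch resources (reading) */
	VTX_PARAMS_A       = 0,  /* kernel parameters (reserved) */
	VTX_GLOBAL_READ    = 1,  /* global memory pool */
	VTX_SHADER_RODATA  = 2,  /* constants LLVM places in the code buffer */
	VTX_PARAMS_B       = 3,  /* kernel parameters, see evergreen_compute_upload_input() */
	VTX_FIRST_RESOURCE = 4,  /* compute resources, see evergreen_set_compute_resources() */
};
#endif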
86
87 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource *buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create((struct pipe_screen*) screen,
94 0, PIPE_USAGE_IMMUTABLE, size);
95
96 return (struct r600_resource *)buffer;
97 }
98
99
100 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
101 unsigned id,
102 struct r600_resource *bo,
103 int start,
104 int size)
105 {
106 struct pipe_surface rat_templ;
107 struct r600_surface *surf = NULL;
108 struct r600_context *rctx = NULL;
109
110 assert(id < 12);
111 assert((size & 3) == 0);
112 assert((start & 0xFF) == 0);
113
114 rctx = pipe->ctx;
115
116 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
117
118 /* Create the RAT surface */
119 memset(&rat_templ, 0, sizeof(rat_templ));
120 rat_templ.format = PIPE_FORMAT_R32_UINT;
121 rat_templ.u.tex.level = 0;
122 rat_templ.u.tex.first_layer = 0;
123 rat_templ.u.tex.last_layer = 0;
124
125 /* Add the RAT to the list of color buffers. Drop the old buffer first. */
126 pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer.resource = buffer;
155 vb->is_user_buffer = false;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 /* We need to define these R600 registers here, because we can't include
181 * r600d.h alongside evergreend.h.
182 */
183 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
184 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
185
186 #ifdef HAVE_OPENCL
187 static void parse_symbol_table(Elf_Data *symbol_table_data,
188 const GElf_Shdr *symbol_table_header,
189 struct ac_shader_binary *binary)
190 {
191 GElf_Sym symbol;
192 unsigned i = 0;
193 unsigned symbol_count =
194 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
195
196 /* We are over-allocating this list, because symbol_count gives the
197 * total number of symbols, and we will only be filling the list
198 * with offsets of global symbols. The memory savings from
199 * allocating the correct size of this list will be small, and
200 * I don't think it is worth the cost of pre-computing the number
201 * of global symbols.
202 */
203 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
204
205 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
206 unsigned i;
207 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
208 symbol.st_shndx == 0 /* Undefined symbol */) {
209 continue;
210 }
211
212 binary->global_symbol_offsets[binary->global_symbol_count] =
213 symbol.st_value;
214
215 /* Sort the list using bubble sort. This list will usually
216 * be small. */
217 for (i = binary->global_symbol_count; i > 0; --i) {
218 uint64_t lhs = binary->global_symbol_offsets[i - 1];
219 uint64_t rhs = binary->global_symbol_offsets[i];
220 if (lhs < rhs) {
221 break;
222 }
223 binary->global_symbol_offsets[i] = lhs;
224 binary->global_symbol_offsets[i - 1] = rhs;
225 }
226 ++binary->global_symbol_count;
227 }
228 }
229
230
231 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
232 unsigned symbol_sh_link,
233 struct ac_shader_binary *binary)
234 {
235 unsigned i;
236
237 if (!relocs || !symbols || !binary->reloc_count) {
238 return;
239 }
240 binary->relocs = CALLOC(binary->reloc_count,
241 sizeof(struct ac_shader_reloc));
242 for (i = 0; i < binary->reloc_count; i++) {
243 GElf_Sym symbol;
244 GElf_Rel rel;
245 char *symbol_name;
246 struct ac_shader_reloc *reloc = &binary->relocs[i];
247
248 gelf_getrel(relocs, i, &rel);
249 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
250 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
251
252 reloc->offset = rel.r_offset;
253 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
254 reloc->name[sizeof(reloc->name)-1] = 0;
255 }
256 }
257
258 static void r600_elf_read(const char *elf_data, unsigned elf_size,
259 struct ac_shader_binary *binary)
260 {
261 char *elf_buffer;
262 Elf *elf;
263 Elf_Scn *section = NULL;
264 Elf_Data *symbols = NULL, *relocs = NULL;
265 size_t section_str_index;
266 unsigned symbol_sh_link = 0;
267
268 /* One of the libelf implementations
269 * (http://www.mr511.de/software/english.htm) requires calling
270 * elf_version() before elf_memory().
271 */
272 elf_version(EV_CURRENT);
273 elf_buffer = MALLOC(elf_size);
274 memcpy(elf_buffer, elf_data, elf_size);
275
276 elf = elf_memory(elf_buffer, elf_size);
277
278 elf_getshdrstrndx(elf, &section_str_index);
279
280 while ((section = elf_nextscn(elf, section))) {
281 const char *name;
282 Elf_Data *section_data = NULL;
283 GElf_Shdr section_header;
284 if (gelf_getshdr(section, &section_header) != &section_header) {
285 fprintf(stderr, "Failed to read ELF section header\n");
286 return;
287 }
288 name = elf_strptr(elf, section_str_index, section_header.sh_name);
289 if (!strcmp(name, ".text")) {
290 section_data = elf_getdata(section, section_data);
291 binary->code_size = section_data->d_size;
292 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
293 memcpy(binary->code, section_data->d_buf, binary->code_size);
294 } else if (!strcmp(name, ".AMDGPU.config")) {
295 section_data = elf_getdata(section, section_data);
296 binary->config_size = section_data->d_size;
297 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
298 memcpy(binary->config, section_data->d_buf, binary->config_size);
299 } else if (!strcmp(name, ".AMDGPU.disasm")) {
300 /* Always read disassembly if it's available. */
301 section_data = elf_getdata(section, section_data);
302 binary->disasm_string = strndup(section_data->d_buf,
303 section_data->d_size);
304 } else if (!strncmp(name, ".rodata", 7)) {
305 section_data = elf_getdata(section, section_data);
306 binary->rodata_size = section_data->d_size;
307 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
308 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
309 } else if (!strncmp(name, ".symtab", 7)) {
310 symbols = elf_getdata(section, section_data);
311 symbol_sh_link = section_header.sh_link;
312 parse_symbol_table(symbols, &section_header, binary);
313 } else if (!strcmp(name, ".rel.text")) {
314 relocs = elf_getdata(section, section_data);
315 binary->reloc_count = section_header.sh_size /
316 section_header.sh_entsize;
317 }
318 }
319
320 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
321
322 if (elf) {
323 elf_end(elf);
324 }
325 FREE(elf_buffer);
326
327 /* Cache the config size per symbol */
328 if (binary->global_symbol_count) {
329 binary->config_size_per_symbol =
330 binary->config_size / binary->global_symbol_count;
331 } else {
332 binary->global_symbol_count = 1;
333 binary->config_size_per_symbol = binary->config_size;
334 }
335 }
336
337 static const unsigned char *r600_shader_binary_config_start(
338 const struct ac_shader_binary *binary,
339 uint64_t symbol_offset)
340 {
341 unsigned i;
342 for (i = 0; i < binary->global_symbol_count; ++i) {
343 if (binary->global_symbol_offsets[i] == symbol_offset) {
344 unsigned offset = i * binary->config_size_per_symbol;
345 return binary->config + offset;
346 }
347 }
348 return binary->config;
349 }
350
351 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
352 struct r600_bytecode *bc,
353 uint64_t symbol_offset,
354 boolean *use_kill)
355 {
356 unsigned i;
357 const unsigned char *config =
358 r600_shader_binary_config_start(binary, symbol_offset);
359
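/* The config section is a flat array of (register, value) pairs: each pair
 * is two little-endian 32-bit words, hence the 8-byte stride of the loop
 * below. */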
360 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
361 unsigned reg =
362 util_le32_to_cpu(*(uint32_t*)(config + i));
363 unsigned value =
364 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
365 switch (reg) {
366 /* R600 / R700 */
367 case R_028850_SQ_PGM_RESOURCES_PS:
368 case R_028868_SQ_PGM_RESOURCES_VS:
369 /* Evergreen / Northern Islands */
370 case R_028844_SQ_PGM_RESOURCES_PS:
371 case R_028860_SQ_PGM_RESOURCES_VS:
372 case R_0288D4_SQ_PGM_RESOURCES_LS:
373 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
374 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
375 break;
376 case R_02880C_DB_SHADER_CONTROL:
377 *use_kill = G_02880C_KILL_ENABLE(value);
378 break;
379 case R_0288E8_SQ_LDS_ALLOC:
380 bc->nlds_dw = value;
381 break;
382 }
383 }
384 }
385
386 static unsigned r600_create_shader(struct r600_bytecode *bc,
387 const struct ac_shader_binary *binary,
388 boolean *use_kill)
389
390 {
391 assert(binary->code_size % 4 == 0);
392 bc->bytecode = CALLOC(1, binary->code_size);
393 memcpy(bc->bytecode, binary->code, binary->code_size);
394 bc->ndw = binary->code_size / 4;
395
396 r600_shader_binary_read_config(binary, bc, 0, use_kill);
397 return 0;
398 }
399
400 #endif
401
402 static void r600_destroy_shader(struct r600_bytecode *bc)
403 {
404 FREE(bc->bytecode);
405 }
406
407 static void *evergreen_create_compute_state(struct pipe_context *ctx,
408 const struct pipe_compute_state *cso)
409 {
410 struct r600_context *rctx = (struct r600_context *)ctx;
411 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
412 #ifdef HAVE_OPENCL
413 const struct pipe_llvm_program_header *header;
414 const char *code;
415 void *p;
416 boolean use_kill;
417 #endif
418
419 shader->ctx = rctx;
420 shader->local_size = cso->req_local_mem;
421 shader->private_size = cso->req_private_mem;
422 shader->input_size = cso->req_input_mem;
423
424 shader->ir_type = cso->ir_type;
425
426 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
427 shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
428 return shader;
429 }
430 #ifdef HAVE_OPENCL
431 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
432 header = cso->prog;
433 code = cso->prog + sizeof(struct pipe_llvm_program_header);
434 radeon_shader_binary_init(&shader->binary);
435 r600_elf_read(code, header->num_bytes, &shader->binary);
436 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
437
438 /* Upload code + ROdata */
439 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
440 shader->bc.ndw * 4);
441 p = r600_buffer_map_sync_with_rings(
442 &rctx->b, shader->code_bo,
443 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
444 //TODO: use util_memcpy_cpu_to_le32 ?
445 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
446 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
447 #endif
448
449 return shader;
450 }
451
452 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
453 {
454 struct r600_context *rctx = (struct r600_context *)ctx;
455 struct r600_pipe_compute *shader = state;
456
457 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
458
459 if (!shader)
460 return;
461
462 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
463 r600_delete_shader_selector(ctx, shader->sel);
464 } else {
465 #ifdef HAVE_OPENCL
466 radeon_shader_binary_clean(&shader->binary);
467 pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
468 pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
469 #endif
470 r600_destroy_shader(&shader->bc);
471 }
472 FREE(shader);
473 }
474
475 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
476 {
477 struct r600_context *rctx = (struct r600_context *)ctx;
478 struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
479 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
480
481 if (!state) {
482 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
483 return;
484 }
485
486 if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
487 bool compute_dirty;
488
489 r600_shader_select(ctx, cstate->sel, &compute_dirty);
490 }
491
492 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
493 }
494
495 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
496 * kernel parameters, there are implicit parameters that need to be stored
497 * in the vertex buffer as well. Here is how these parameters are organized in
498 * the buffer:
499 *
500 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
501 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
502 * DWORDS 6-8: Number of work items within each work group in each dimension
503 * (x,y,z)
504 * DWORDS 9+ : Kernel parameters
505 */
506 static void evergreen_compute_upload_input(struct pipe_context *ctx,
507 const struct pipe_grid_info *info)
508 {
509 struct r600_context *rctx = (struct r600_context *)ctx;
510 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
511 unsigned i;
512 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
513 * parameters.
514 */
515 unsigned input_size;
516 uint32_t *num_work_groups_start;
517 uint32_t *global_size_start;
518 uint32_t *local_size_start;
519 uint32_t *kernel_parameters_start;
520 struct pipe_box box;
521 struct pipe_transfer *transfer = NULL;
522
523 if (!shader)
524 return;
525 if (shader->input_size == 0) {
526 return;
527 }
528 input_size = shader->input_size + 36;
529 if (!shader->kernel_param) {
530 /* Add space for the grid dimensions */
531 shader->kernel_param = (struct r600_resource *)
532 pipe_buffer_create(ctx->screen, 0,
533 PIPE_USAGE_IMMUTABLE, input_size);
534 }
535
536 u_box_1d(0, input_size, &box);
537 num_work_groups_start = ctx->transfer_map(ctx,
538 (struct pipe_resource*)shader->kernel_param,
539 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
540 &box, &transfer);
541 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
542 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
543 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
544
545 /* Copy the grid size (the number of work groups) */
546 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
547
548 /* Copy the global size */
549 for (i = 0; i < 3; i++) {
550 global_size_start[i] = info->grid[i] * info->block[i];
551 }
552
553 /* Copy the local dimensions */
554 memcpy(local_size_start, info->block, 3 * sizeof(uint));
555
556 /* Copy the kernel inputs */
557 memcpy(kernel_parameters_start, info->input, shader->input_size);
558
559 for (i = 0; i < (input_size / 4); i++) {
560 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
561 ((unsigned*)num_work_groups_start)[i]);
562 }
563
564 ctx->transfer_unmap(ctx, transfer);
565
566 /* ID=0 and ID=3 are reserved for the parameters.
567 * LLVM will prefer to use ID=0, but it does not work for dynamic
568 * indices. */
569 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
570 (struct pipe_resource*)shader->kernel_param);
571 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
572 (struct pipe_resource*)shader->kernel_param);
573 }
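/* A sketch, for illustration only: the input buffer written by
 * evergreen_compute_upload_input() above, laid out as a hypothetical C
 * struct (this type is not used by the driver).
 */
#if 0
struct evergreen_kernel_input {
	uint32_t num_work_groups[3];   /* DWORDS 0-2: number of work groups (x, y, z) */
	uint32_t global_size[3];       /* DWORDS 3-5: grid[i] * block[i] */
	uint32_t local_size[3];        /* DWORDS 6-8: work items per group (x, y, z) */
	uint32_t kernel_parameters[];  /* DWORDS 9+ : explicit kernel arguments */
};
#endif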
574
575 static void evergreen_emit_dispatch(struct r600_context *rctx,
576 const struct pipe_grid_info *info,
577 uint32_t indirect_grid[3])
578 {
579 int i;
580 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
581 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
582 bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
583 unsigned num_waves;
584 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
585 unsigned wave_divisor = (16 * num_pipes);
586 int group_size = 1;
587 int grid_size = 1;
588 unsigned lds_size = shader->local_size / 4;
589
590 if (shader->ir_type != PIPE_SHADER_IR_TGSI)
591 lds_size += shader->bc.nlds_dw;
592
593 /* Calculate group_size/grid_size */
594 for (i = 0; i < 3; i++) {
595 group_size *= info->block[i];
596 }
597
598 for (i = 0; i < 3; i++) {
599 grid_size *= info->grid[i];
600 }
601
602 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
603 num_waves = (info->block[0] * info->block[1] * info->block[2] +
604 wave_divisor - 1) / wave_divisor;
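/* For example (an assumed configuration): an 8x8x1 thread block on a chip
 * with two quad pipes gives wave_divisor = 16 * 2 = 32, so
 * num_waves = ceil(64 / 32) = 2 wavefronts per thread block. */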
605
606 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
607 "%u wavefronts per thread block, "
608 "allocating %u dwords lds.\n",
609 num_pipes, num_waves, lds_size);
610
611 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
612
613 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
614 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
615 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
616 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
617
618 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
619 group_size);
620
621 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
622 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
623 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
624 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
625
626 if (rctx->b.chip_class < CAYMAN) {
627 assert(lds_size <= 8192);
628 } else {
629 /* Cayman appears to have a slightly smaller limit, see the
630 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
631 assert(lds_size <= 8160);
632 }
633
634 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
635 lds_size | (num_waves << 14));
636
637 if (info->indirect) {
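/* The indirect grid size has already been read back from the indirect
 * buffer in compute_emit_cs(), so a direct dispatch packet is emitted
 * here with those values. */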
638 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
639 radeon_emit(cs, indirect_grid[0]);
640 radeon_emit(cs, indirect_grid[1]);
641 radeon_emit(cs, indirect_grid[2]);
642 radeon_emit(cs, 1);
643 } else {
644 /* Dispatch packet */
645 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
646 radeon_emit(cs, info->grid[0]);
647 radeon_emit(cs, info->grid[1]);
648 radeon_emit(cs, info->grid[2]);
649 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
650 radeon_emit(cs, 1);
651 }
652
653 if (rctx->is_debug)
654 eg_trace_emit(rctx);
655 }
656
657 static void compute_setup_cbs(struct r600_context *rctx)
658 {
659 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
660 unsigned i;
661
662 /* Emit colorbuffers. */
663 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
664 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
665 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
666 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
667 (struct r600_resource*)cb->base.texture,
668 RADEON_USAGE_READWRITE,
669 RADEON_PRIO_SHADER_RW_BUFFER);
670
671 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
672 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
673 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
674 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
675 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
676 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
677 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
678 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
679
680 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
681 radeon_emit(cs, reloc);
682
683 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
684 radeon_emit(cs, reloc);
685 }
686 for (; i < 8 ; i++)
687 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
688 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
689 for (; i < 12; i++)
690 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
691 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
692
693 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
694 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
695 rctx->compute_cb_target_mask);
696 }
697
698 static void compute_emit_cs(struct r600_context *rctx,
699 const struct pipe_grid_info *info)
700 {
701 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
702 bool compute_dirty = false;
703 struct r600_pipe_shader *current;
704 struct r600_shader_atomic combined_atomics[8];
705 uint8_t atomic_used_mask;
706 uint32_t indirect_grid[3] = { 0, 0, 0 };
707
708 /* make sure that the gfx ring is the only active one */
709 if (radeon_emitted(rctx->b.dma.cs, 0)) {
710 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
711 }
712
713 r600_update_compressed_resource_state(rctx, true);
714
715 if (!rctx->cmd_buf_is_compute) {
716 rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
717 rctx->cmd_buf_is_compute = true;
718 }
719
720 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
721 r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
722 current = rctx->cs_shader_state.shader->sel->current;
723 if (compute_dirty) {
724 rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
725 r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
726 r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
727 }
728
729 bool need_buf_const = current->shader.uses_tex_buffers ||
730 current->shader.has_txq_cube_array_z_comp;
731
732 if (info->indirect) {
733 struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
734 unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
735 unsigned offset = info->indirect_offset / 4;
736 indirect_grid[0] = data[offset];
737 indirect_grid[1] = data[offset + 1];
738 indirect_grid[2] = data[offset + 2];
739 }
740 for (int i = 0; i < 3; i++) {
741 rctx->cs_block_grid_sizes[i] = info->block[i];
742 rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
743 }
744 rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
745 rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
746
747 evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
748 r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
749
750 if (need_buf_const) {
751 eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
752 }
753 r600_update_driver_const_buffers(rctx, true);
754
755 evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
756 if (atomic_used_mask) {
757 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
758 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
759 }
760 } else
761 r600_need_cs_space(rctx, 0, true, 0);
762
763 /* Initialize all the compute-related registers.
764 *
765 * See evergreen_init_atom_start_compute_cs() in this file for the list
766 * of registers initialized by the start_compute_cs_cmd atom.
767 */
768 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
769
770 /* emit config state */
771 if (rctx->b.chip_class == EVERGREEN) {
772 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
773 radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
774 radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
775 radeon_emit(cs, 0);
776 radeon_emit(cs, 0);
777 radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
778 } else
779 r600_emit_atom(rctx, &rctx->config_state.atom);
780 }
781
782 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
783 r600_flush_emit(rctx);
784
785 if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {
786
787 compute_setup_cbs(rctx);
788
789 /* Emit vertex buffer state */
790 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
791 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
792 } else {
793 uint32_t rat_mask;
794
795 rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
796 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
797 rat_mask);
798 }
799
800 r600_emit_atom(rctx, &rctx->b.render_cond_atom);
801
802 /* Emit constant buffer state */
803 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
804
805 /* Emit sampler state */
806 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
807
808 /* Emit sampler view (texture resource) state */
809 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
810
811 /* Emit images state */
812 r600_emit_atom(rctx, &rctx->compute_images.atom);
813
814 /* Emit buffers state */
815 r600_emit_atom(rctx, &rctx->compute_buffers.atom);
816
817 /* Emit shader state */
818 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
819
820 /* Emit dispatch state and dispatch packet */
821 evergreen_emit_dispatch(rctx, info, indirect_grid);
822
823 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
824 */
825 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
826 R600_CONTEXT_INV_VERTEX_CACHE |
827 R600_CONTEXT_INV_TEX_CACHE;
828 r600_flush_emit(rctx);
829 rctx->b.flags = 0;
830
831 if (rctx->b.chip_class >= CAYMAN) {
832 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
833 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
834 /* DEALLOC_STATE prevents the GPU from hanging when a
835 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
836 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
837 */
838 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
839 radeon_emit(cs, 0);
840 }
841 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
842 evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
843
844 #if 0
845 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
846 for (i = 0; i < cs->cdw; i++) {
847 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
848 }
849 #endif
850
851 }
852
853
854 /**
855 * Emit function for r600_cs_shader_state atom
856 */
857 void evergreen_emit_cs_shader(struct r600_context *rctx,
858 struct r600_atom *atom)
859 {
860 struct r600_cs_shader_state *state =
861 (struct r600_cs_shader_state*)atom;
862 struct r600_pipe_compute *shader = state->shader;
863 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
864 uint64_t va;
865 struct r600_resource *code_bo;
866 unsigned ngpr, nstack;
867
868 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
869 code_bo = shader->sel->current->bo;
870 va = shader->sel->current->bo->gpu_address;
871 ngpr = shader->sel->current->shader.bc.ngpr;
872 nstack = shader->sel->current->shader.bc.nstack;
873 } else {
874 code_bo = shader->code_bo;
875 va = shader->code_bo->gpu_address + state->pc;
876 ngpr = shader->bc.ngpr;
877 nstack = shader->bc.nstack;
878 }
879
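/* Compute shaders run on the CS (aka LS) hardware stage, so the program
 * address and resources are programmed through the SQ_PGM_*_LS registers. */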
880 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
881 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
882 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
883 S_0288D4_NUM_GPRS(ngpr) |
884 S_0288D4_DX10_CLAMP(1) |
885 S_0288D4_STACK_SIZE(nstack));
886 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
887
888 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
889 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
890 code_bo, RADEON_USAGE_READ,
891 RADEON_PRIO_SHADER_BINARY));
892 }
893
894 static void evergreen_launch_grid(struct pipe_context *ctx,
895 const struct pipe_grid_info *info)
896 {
897 struct r600_context *rctx = (struct r600_context *)ctx;
898 #ifdef HAVE_OPENCL
899 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
900 boolean use_kill;
901
902 if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
903 rctx->cs_shader_state.pc = info->pc;
904 /* Get the config information for this kernel. */
905 r600_shader_binary_read_config(&shader->binary, &shader->bc,
906 info->pc, &use_kill);
907 } else {
908 use_kill = false;
909 rctx->cs_shader_state.pc = 0;
910 }
911 #endif
912
913 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
914
915
916 evergreen_compute_upload_input(ctx, info);
917 compute_emit_cs(rctx, info);
918 }
919
920 static void evergreen_set_compute_resources(struct pipe_context *ctx,
921 unsigned start, unsigned count,
922 struct pipe_surface **surfaces)
923 {
924 struct r600_context *rctx = (struct r600_context *)ctx;
925 struct r600_surface **resources = (struct r600_surface **)surfaces;
926
927 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
928 start, count);
929
930 for (unsigned i = 0; i < count; i++) {
931 /* The first four vertex buffers are reserved for parameters and
932 * global buffers. */
933 unsigned vtx_id = 4 + i;
934 if (resources[i]) {
935 struct r600_resource_global *buffer =
936 (struct r600_resource_global*)
937 resources[i]->base.texture;
938 if (resources[i]->base.writable) {
939 assert(i+1 < 12);
940
941 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
942 (struct r600_resource *)resources[i]->base.texture,
943 buffer->chunk->start_in_dw*4,
944 resources[i]->base.texture->width0);
945 }
946
947 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
948 buffer->chunk->start_in_dw * 4,
949 resources[i]->base.texture);
950 }
951 }
952 }
953
954 static void evergreen_set_global_binding(struct pipe_context *ctx,
955 unsigned first, unsigned n,
956 struct pipe_resource **resources,
957 uint32_t **handles)
958 {
959 struct r600_context *rctx = (struct r600_context *)ctx;
960 struct compute_memory_pool *pool = rctx->screen->global_pool;
961 struct r600_resource_global **buffers =
962 (struct r600_resource_global **)resources;
963 unsigned i;
964
965 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
966 first, n);
967
968 if (!resources) {
969 /* XXX: Unset */
970 return;
971 }
972
973 /* We mark these items for promotion to the pool if they
974 * aren't already there */
975 for (i = first; i < first + n; i++) {
976 struct compute_memory_item *item = buffers[i]->chunk;
977
978 if (!is_item_in_pool(item))
979 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
980 }
981
982 if (compute_memory_finalize_pending(pool, ctx) == -1) {
983 /* XXX: Unset */
984 return;
985 }
986
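/* Rewrite each handle from an offset relative to its own buffer into an
 * offset relative to the start of the global memory pool
 * (chunk->start_in_dw is in dwords, hence the multiply by 4). */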
987 for (i = first; i < first + n; i++)
988 {
989 uint32_t buffer_offset;
990 uint32_t handle;
991 assert(resources[i]->target == PIPE_BUFFER);
992 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
993
994 buffer_offset = util_le32_to_cpu(*(handles[i]));
995 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
996
997 *(handles[i]) = util_cpu_to_le32(handle);
998 }
999
1000 /* globals for writing */
1001 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1002 /* globals for reading */
1003 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1004 (struct pipe_resource*)pool->bo);
1005
1006 /* constants for reading; LLVM puts them in the text segment */
1007 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1008 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1009 }
1010
1011 /**
1012 * This function initializes all the compute specific registers that need to
1013 * be initialized for each compute command stream. Registers that are common
1014 * to both compute and 3D will be initialized at the beginning of each compute
1015 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
1016 * packet requires that the shader type bit be set, we must initialize all
1017 * context registers needed for compute in this function. The registers
1018 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1019 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1020 * on the GPU family.
1021 */
1022 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1023 {
1024 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1025 int num_threads;
1026 int num_stack_entries;
1027
1028 /* since all required registers are initialized in the
1029 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1030 */
1031 r600_init_command_buffer(cb, 256);
1032 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1033
1034 /* We're setting config registers here. */
1035 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1036 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1037
1038 switch (rctx->b.family) {
1039 case CHIP_CEDAR:
1040 default:
1041 num_threads = 128;
1042 num_stack_entries = 256;
1043 break;
1044 case CHIP_REDWOOD:
1045 num_threads = 128;
1046 num_stack_entries = 256;
1047 break;
1048 case CHIP_JUNIPER:
1049 num_threads = 128;
1050 num_stack_entries = 512;
1051 break;
1052 case CHIP_CYPRESS:
1053 case CHIP_HEMLOCK:
1054 num_threads = 128;
1055 num_stack_entries = 512;
1056 break;
1057 case CHIP_PALM:
1058 num_threads = 128;
1059 num_stack_entries = 256;
1060 break;
1061 case CHIP_SUMO:
1062 num_threads = 128;
1063 num_stack_entries = 256;
1064 break;
1065 case CHIP_SUMO2:
1066 num_threads = 128;
1067 num_stack_entries = 512;
1068 break;
1069 case CHIP_BARTS:
1070 num_threads = 128;
1071 num_stack_entries = 512;
1072 break;
1073 case CHIP_TURKS:
1074 num_threads = 128;
1075 num_stack_entries = 256;
1076 break;
1077 case CHIP_CAICOS:
1078 num_threads = 128;
1079 num_stack_entries = 256;
1080 break;
1081 }
1082
1083 /* The primitive type always needs to be POINTLIST for compute. */
1084 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1085 V_008958_DI_PT_POINTLIST);
1086
1087 if (rctx->b.chip_class < CAYMAN) {
1088
1089 /* These registers control which simds can be used by each stage.
1090 * The default for these registers is 0xffffffff, which means
1091 * all simds are available for each stage. It's possible we may
1092 * want to play around with these in the future, but for now
1093 * the default value is fine.
1094 *
1095 * R_008E20_SQ_STATIC_THREAD_MGMT1
1096 * R_008E24_SQ_STATIC_THREAD_MGMT2
1097 * R_008E28_SQ_STATIC_THREAD_MGMT3
1098 */
1099
1100 /* XXX: We may need to adjust the thread and stack resource
1101 * values for 3D/compute interop */
1102
1103 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1104
1105 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1106 * Set the number of threads used by the PS/VS/GS/ES stage to
1107 * 0.
1108 */
1109 r600_store_value(cb, 0);
1110
1111 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1112 * Set the number of threads used by the CS (aka LS) stage to
1113 * the maximum number of threads and set the number of threads
1114 * for the HS stage to 0. */
1115 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1116
1117 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1118 * Set the Control Flow stack entries to 0 for PS/VS stages */
1119 r600_store_value(cb, 0);
1120
1121 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1122 * Set the Control Flow stack entries to 0 for GS/ES stages */
1123 r600_store_value(cb, 0);
1124
1125 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1126 * Set the Control Flow stack entries to 0 for the HS stage, and
1127 * set it to the maximum value for the CS (aka LS) stage. */
1128 r600_store_value(cb,
1129 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1130 }
1131 /* Give the compute shader all the available LDS space.
1132 * NOTE: This only sets the maximum number of dwords that a compute
1133 * shader can allocate. When a shader is executed, we still need to
1134 * allocate the appropriate amount of LDS dwords using the
1135 * CM_R_0288E8_SQ_LDS_ALLOC register.
1136 */
1137 if (rctx->b.chip_class < CAYMAN) {
1138 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1139 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1140 } else {
1141 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1142 S_0286FC_NUM_PS_LDS(0) |
1143 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1144 }
1145
1146 /* Context Registers */
1147
1148 if (rctx->b.chip_class < CAYMAN) {
1149 /* workaround for hw issues with dyn gpr - must set all limits
1150 * to 240 instead of 0, 0x1e == 240 / 8
1151 */
1152 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1153 S_028838_PS_GPRS(0x1e) |
1154 S_028838_VS_GPRS(0x1e) |
1155 S_028838_GS_GPRS(0x1e) |
1156 S_028838_ES_GPRS(0x1e) |
1157 S_028838_HS_GPRS(0x1e) |
1158 S_028838_LS_GPRS(0x1e));
1159 }
1160
1161 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1162 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1163 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1164
1165 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1166
1167 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1168 S_0286E8_TID_IN_GROUP_ENA(1) |
1169 S_0286E8_TGID_ENA(1) |
1170 S_0286E8_DISABLE_INDEX_PACK(1));
1171
1172 /* The LOOP_CONST registers are an optimization for loops that allows
1173 * you to store the initial counter, increment value, and maximum
1174 * counter value in a register so that hardware can calculate the
1175 * correct number of iterations for the loop, so that you don't need
1176 * to have the loop counter in your shader code. We don't currently use
1177 * this optimization, so we must keep track of the counter in the
1178 * shader and use a break instruction to exit loops. However, the
1179 * hardware will still use this register to determine when to exit a
1180 * loop, so we need to initialize the counter to 0, set the increment
1181 * value to 1 and the maximum counter value to 4095 (0xfff), which
1182 * is the maximum value allowed. This gives us a maximum of 4096
1183 * iterations for our loops, but hopefully our break instruction will
1184 * execute some time before the 4096th iteration.
1185 */
1186 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
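/* A sketch of how 0x1000FFF appears to decode, matching the comment above:
 * increment = 0x01 (bits 31:24), initial value = 0x000 (bits 23:12),
 * maximum count = 0xFFF (bits 11:0). */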
1187 }
1188
1189 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1190 {
1191 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1192 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1193 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1194 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1195 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1196 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1197 rctx->b.b.launch_grid = evergreen_launch_grid;
1198
1199 }
1200
1201 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1202 struct pipe_resource *resource,
1203 unsigned level,
1204 unsigned usage,
1205 const struct pipe_box *box,
1206 struct pipe_transfer **ptransfer)
1207 {
1208 struct r600_context *rctx = (struct r600_context*)ctx;
1209 struct compute_memory_pool *pool = rctx->screen->global_pool;
1210 struct r600_resource_global* buffer =
1211 (struct r600_resource_global*)resource;
1212
1213 struct compute_memory_item *item = buffer->chunk;
1214 struct pipe_resource *dst = NULL;
1215 unsigned offset = box->x;
1216
1217 if (is_item_in_pool(item)) {
1218 compute_memory_demote_item(pool, item, ctx);
1219 }
1220 else {
1221 if (item->real_buffer == NULL) {
1222 item->real_buffer =
1223 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1224 }
1225 }
1226
1227 dst = (struct pipe_resource*)item->real_buffer;
1228
1229 if (usage & PIPE_TRANSFER_READ)
1230 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1231
1232 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1233 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1234 "width = %u, height = %u, depth = %u)\n", level, usage,
1235 box->x, box->y, box->z, box->width, box->height,
1236 box->depth);
1237 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1238 "%u (box.x)\n", item->id, box->x);
1239
1240
1241 assert(resource->target == PIPE_BUFFER);
1242 assert(resource->bind & PIPE_BIND_GLOBAL);
1243 assert(box->x >= 0);
1244 assert(box->y == 0);
1245 assert(box->z == 0);
1246
1247 ///TODO: do it better, mapping is not possible if the pool is too big
1248 return pipe_buffer_map_range(ctx, dst,
1249 offset, box->width, usage, ptransfer);
1250 }
1251
1252 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1253 struct pipe_transfer *transfer)
1254 {
1255 /* struct r600_resource_global objects are not real resources; they just map
1256 * to an offset within the compute memory pool. The function
1257 * r600_compute_global_transfer_map() maps the memory pool
1258 * resource rather than the struct r600_resource_global passed to
1259 * it as an argument and then initializes ptransfer->resource with
1260 * the memory pool resource (via pipe_buffer_map_range).
1261 * When transfer_unmap is called it uses the memory pool's
1262 * vtable which calls r600_buffer_transfer_unmap() rather than
1263 * this function.
1264 */
1265 assert (!"This function should not be called");
1266 }
1267
1268 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1269 struct pipe_transfer *transfer,
1270 const struct pipe_box *box)
1271 {
1272 assert(0 && "TODO");
1273 }
1274
1275 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1276 struct pipe_resource *res)
1277 {
1278 struct r600_resource_global* buffer = NULL;
1279 struct r600_screen* rscreen = NULL;
1280
1281 assert(res->target == PIPE_BUFFER);
1282 assert(res->bind & PIPE_BIND_GLOBAL);
1283
1284 buffer = (struct r600_resource_global*)res;
1285 rscreen = (struct r600_screen*)screen;
1286
1287 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1288
1289 buffer->chunk = NULL;
1290 free(res);
1291 }
1292
1293 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1294 {
1295 u_default_resource_get_handle, /* get_handle */
1296 r600_compute_global_buffer_destroy, /* resource_destroy */
1297 r600_compute_global_transfer_map, /* transfer_map */
1298 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1299 r600_compute_global_transfer_unmap, /* transfer_unmap */
1300 };
1301
1302 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1303 const struct pipe_resource *templ)
1304 {
1305 struct r600_resource_global* result = NULL;
1306 struct r600_screen* rscreen = NULL;
1307 int size_in_dw = 0;
1308
1309 assert(templ->target == PIPE_BUFFER);
1310 assert(templ->bind & PIPE_BIND_GLOBAL);
1311 assert(templ->array_size == 1 || templ->array_size == 0);
1312 assert(templ->depth0 == 1 || templ->depth0 == 0);
1313 assert(templ->height0 == 1 || templ->height0 == 0);
1314
1315 result = (struct r600_resource_global*)
1316 CALLOC(sizeof(struct r600_resource_global), 1);
1317 rscreen = (struct r600_screen*)screen;
1318
1319 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1320 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1321 templ->array_size);
1322
1323 result->base.b.vtbl = &r600_global_buffer_vtbl;
1324 result->base.b.b = *templ;
1325 result->base.b.b.screen = screen;
1326 pipe_reference_init(&result->base.b.b.reference, 1);
1327
1328 size_in_dw = (templ->width0+3) / 4;
1329
1330 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1331
1332 if (result->chunk == NULL)
1333 {
1334 free(result);
1335 return NULL;
1336 }
1337
1338 return &result->base.b.b;
1339 }