src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "pipebuffer/pb_buffer.h"
46 #include "evergreend.h"
47 #include "r600_shader.h"
48 #include "r600_pipe.h"
49 #include "r600_formats.h"
50 #include "evergreen_compute.h"
51 #include "evergreen_compute_internal.h"
52 #include "compute_memory_pool.h"
53 #include "sb/sb_public.h"
54 #include <inttypes.h>
55
56 /**
57 RAT0 is for global binding writes
58 VTX1 is for global binding reads
59
60 for writing images, RAT1...
61 for reading images, TEX2...
62 TEX2-RAT1 is paired
63
64 TEX2... consumes the same fetch resources that VTX2... would consume
65
66 CONST0 and VTX0 are for parameters
67 CONST0 binds the smaller input parameter buffer and is used for constant
68 indexing; it is also cached in the constant cache
69 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
70 the constant cache can handle
71
72 RATs are limited to 12, so we can bind at most 11 textures for writing
73 because we reserve RAT0 for global bindings. With byte addressing enabled
74 we should reserve another one too => at most 10 image bindings for writing.
75
76 from Nvidia OpenCL:
77 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
78 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
79
80 so 10 for writing is enough. 176 is the max for reading according to the docs.
81
82 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
83 writable images will consume TEX slots, and VTX slots too because of linear indexing
84
85 */
86
87 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource *buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create((struct pipe_screen*) screen,
94 0, PIPE_USAGE_IMMUTABLE, size);
95
96 return (struct r600_resource *)buffer;
97 }
98
99
100 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
101 unsigned id,
102 struct r600_resource *bo,
103 int start,
104 int size)
105 {
106 struct pipe_surface rat_templ;
107 struct r600_surface *surf = NULL;
108 struct r600_context *rctx = NULL;
109
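/* Sanity checks, following the binding scheme described at the top of this
 * file: RAT ids are limited to 12, the surface size must be a whole number
 * of dwords, and the start offset must be 256-byte aligned. */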
110 assert(id < 12);
111 assert((size & 3) == 0);
112 assert((start & 0xFF) == 0);
113
114 rctx = pipe->ctx;
115
116 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
117
118 /* Create the RAT surface */
119 memset(&rat_templ, 0, sizeof(rat_templ));
120 rat_templ.format = PIPE_FORMAT_R32_UINT;
121 rat_templ.u.tex.level = 0;
122 rat_templ.u.tex.first_layer = 0;
123 rat_templ.u.tex.last_layer = 0;
124
125 /* Add the RAT to the list of color buffers. Drop the old buffer first. */
126 pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer.resource = buffer;
155 vb->is_user_buffer = false;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 /* We need to define these R600 registers here, because we can't include
181 * r600d.h alongside evergreend.h.
182 */
183 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
184 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
185
186 #ifdef HAVE_OPENCL
187 static void parse_symbol_table(Elf_Data *symbol_table_data,
188 const GElf_Shdr *symbol_table_header,
189 struct ac_shader_binary *binary)
190 {
191 GElf_Sym symbol;
192 unsigned i = 0;
193 unsigned symbol_count =
194 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
195
196 /* We are over-allocating this list, because symbol_count gives the
197 * total number of symbols, and we will only be filling the list
198 * with offsets of global symbols. The memory savings from
199 * allocating the correct size of this list will be small, and
200 * I don't think it is worth the cost of pre-computing the number
201 * of global symbols.
202 */
203 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
204
205 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
206 unsigned i;
207 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
208 symbol.st_shndx == 0 /* Undefined symbol */) {
209 continue;
210 }
211
212 binary->global_symbol_offsets[binary->global_symbol_count] =
213 symbol.st_value;
214
215 /* Keep the list sorted as we insert (a simple insertion-sort step);
216 * this list will usually be small. */
217 for (i = binary->global_symbol_count; i > 0; --i) {
218 uint64_t lhs = binary->global_symbol_offsets[i - 1];
219 uint64_t rhs = binary->global_symbol_offsets[i];
220 if (lhs < rhs) {
221 break;
222 }
223 binary->global_symbol_offsets[i] = lhs;
224 binary->global_symbol_offsets[i - 1] = rhs;
225 }
226 ++binary->global_symbol_count;
227 }
228 }
229
230
231 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
232 unsigned symbol_sh_link,
233 struct ac_shader_binary *binary)
234 {
235 unsigned i;
236
237 if (!relocs || !symbols || !binary->reloc_count) {
238 return;
239 }
240 binary->relocs = CALLOC(binary->reloc_count,
241 sizeof(struct ac_shader_reloc));
242 for (i = 0; i < binary->reloc_count; i++) {
243 GElf_Sym symbol;
244 GElf_Rel rel;
245 char *symbol_name;
246 struct ac_shader_reloc *reloc = &binary->relocs[i];
247
248 gelf_getrel(relocs, i, &rel);
249 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
250 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
251
252 reloc->offset = rel.r_offset;
253 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
254 reloc->name[sizeof(reloc->name)-1] = 0;
255 }
256 }
257
258 static void r600_elf_read(const char *elf_data, unsigned elf_size,
259 struct ac_shader_binary *binary)
260 {
261 char *elf_buffer;
262 Elf *elf;
263 Elf_Scn *section = NULL;
264 Elf_Data *symbols = NULL, *relocs = NULL;
265 size_t section_str_index;
266 unsigned symbol_sh_link = 0;
267
268 /* One of the libelf implementations
269 * (http://www.mr511.de/software/english.htm) requires calling
270 * elf_version() before elf_memory().
271 */
272 elf_version(EV_CURRENT);
273 elf_buffer = MALLOC(elf_size);
274 memcpy(elf_buffer, elf_data, elf_size);
275
276 elf = elf_memory(elf_buffer, elf_size);
277
278 elf_getshdrstrndx(elf, &section_str_index);
279
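/* Walk every section and pull out the pieces we need: .text (the code),
 * .AMDGPU.config (register/value pairs), .AMDGPU.disasm (optional
 * disassembly), .rodata, .symtab, and .rel.text (relocations, resolved
 * below by parse_relocs()). */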
280 while ((section = elf_nextscn(elf, section))) {
281 const char *name;
282 Elf_Data *section_data = NULL;
283 GElf_Shdr section_header;
284 if (gelf_getshdr(section, &section_header) != &section_header) {
285 fprintf(stderr, "Failed to read ELF section header\n");
286 return;
287 }
288 name = elf_strptr(elf, section_str_index, section_header.sh_name);
289 if (!strcmp(name, ".text")) {
290 section_data = elf_getdata(section, section_data);
291 binary->code_size = section_data->d_size;
292 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
293 memcpy(binary->code, section_data->d_buf, binary->code_size);
294 } else if (!strcmp(name, ".AMDGPU.config")) {
295 section_data = elf_getdata(section, section_data);
296 binary->config_size = section_data->d_size;
297 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
298 memcpy(binary->config, section_data->d_buf, binary->config_size);
299 } else if (!strcmp(name, ".AMDGPU.disasm")) {
300 /* Always read disassembly if it's available. */
301 section_data = elf_getdata(section, section_data);
302 binary->disasm_string = strndup(section_data->d_buf,
303 section_data->d_size);
304 } else if (!strncmp(name, ".rodata", 7)) {
305 section_data = elf_getdata(section, section_data);
306 binary->rodata_size = section_data->d_size;
307 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
308 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
309 } else if (!strncmp(name, ".symtab", 7)) {
310 symbols = elf_getdata(section, section_data);
311 symbol_sh_link = section_header.sh_link;
312 parse_symbol_table(symbols, &section_header, binary);
313 } else if (!strcmp(name, ".rel.text")) {
314 relocs = elf_getdata(section, section_data);
315 binary->reloc_count = section_header.sh_size /
316 section_header.sh_entsize;
317 }
318 }
319
320 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
321
322 if (elf){
323 elf_end(elf);
324 }
325 FREE(elf_buffer);
326
327 /* Cache the config size per symbol */
328 if (binary->global_symbol_count) {
329 binary->config_size_per_symbol =
330 binary->config_size / binary->global_symbol_count;
331 } else {
332 binary->global_symbol_count = 1;
333 binary->config_size_per_symbol = binary->config_size;
334 }
335 }
336
337 static const unsigned char *r600_shader_binary_config_start(
338 const struct ac_shader_binary *binary,
339 uint64_t symbol_offset)
340 {
341 unsigned i;
342 for (i = 0; i < binary->global_symbol_count; ++i) {
343 if (binary->global_symbol_offsets[i] == symbol_offset) {
344 unsigned offset = i * binary->config_size_per_symbol;
345 return binary->config + offset;
346 }
347 }
348 return binary->config;
349 }
350
351 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
352 struct r600_bytecode *bc,
353 uint64_t symbol_offset,
354 boolean *use_kill)
355 {
356 unsigned i;
357 const unsigned char *config =
358 r600_shader_binary_config_start(binary, symbol_offset);
359
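/* The config section is a flat list of (register, value) pairs, 8 bytes per
 * entry, stored little-endian; pick out the registers that describe the
 * kernel's GPR, stack and LDS requirements. */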
360 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
361 unsigned reg =
362 util_le32_to_cpu(*(uint32_t*)(config + i));
363 unsigned value =
364 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
365 switch (reg) {
366 /* R600 / R700 */
367 case R_028850_SQ_PGM_RESOURCES_PS:
368 case R_028868_SQ_PGM_RESOURCES_VS:
369 /* Evergreen / Northern Islands */
370 case R_028844_SQ_PGM_RESOURCES_PS:
371 case R_028860_SQ_PGM_RESOURCES_VS:
372 case R_0288D4_SQ_PGM_RESOURCES_LS:
373 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
374 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
375 break;
376 case R_02880C_DB_SHADER_CONTROL:
377 *use_kill = G_02880C_KILL_ENABLE(value);
378 break;
379 case R_0288E8_SQ_LDS_ALLOC:
380 bc->nlds_dw = value;
381 break;
382 }
383 }
384 }
385
386 static unsigned r600_create_shader(struct r600_bytecode *bc,
387 const struct ac_shader_binary *binary,
388 boolean *use_kill)
389
390 {
391 assert(binary->code_size % 4 == 0);
392 bc->bytecode = CALLOC(1, binary->code_size);
393 memcpy(bc->bytecode, binary->code, binary->code_size);
394 bc->ndw = binary->code_size / 4;
395
396 r600_shader_binary_read_config(binary, bc, 0, use_kill);
397 return 0;
398 }
399
400 #endif
401
402 static void r600_destroy_shader(struct r600_bytecode *bc)
403 {
404 FREE(bc->bytecode);
405 }
406
407 static void *evergreen_create_compute_state(struct pipe_context *ctx,
408 const struct pipe_compute_state *cso)
409 {
410 struct r600_context *rctx = (struct r600_context *)ctx;
411 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
412 #ifdef HAVE_OPENCL
413 const struct pipe_llvm_program_header *header;
414 const char *code;
415 void *p;
416 boolean use_kill;
417 #endif
418
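/* Two kinds of compute state reach this function: TGSI shaders, which go
 * through the regular shader selector and are compiled when selected (see
 * evergreen_bind_compute_state() / r600_shader_select()), and, when built
 * with OpenCL support, a native ELF binary from the LLVM path, which is
 * parsed and uploaded below. */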
419 shader->ctx = rctx;
420 shader->local_size = cso->req_local_mem;
421 shader->private_size = cso->req_private_mem;
422 shader->input_size = cso->req_input_mem;
423
424 shader->ir_type = cso->ir_type;
425
426 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
427 shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
428 return shader;
429 }
430 #ifdef HAVE_OPENCL
431 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
432 header = cso->prog;
433 code = cso->prog + sizeof(struct pipe_llvm_program_header);
434 radeon_shader_binary_init(&shader->binary);
435 r600_elf_read(code, header->num_bytes, &shader->binary);
436 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
437
438 /* Upload code + ROdata */
439 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
440 shader->bc.ndw * 4);
441 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
442 //TODO: use util_memcpy_cpu_to_le32 ?
443 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
444 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
445 #endif
446
447 return shader;
448 }
449
450 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
451 {
452 struct r600_context *rctx = (struct r600_context *)ctx;
453 struct r600_pipe_compute *shader = state;
454
455 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
456
457 if (!shader)
458 return;
459
460 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
461 r600_delete_shader_selector(ctx, shader->sel);
462 } else {
463 #ifdef HAVE_OPENCL
464 radeon_shader_binary_clean(&shader->binary);
465 pipe_resource_reference(&shader->code_bo, NULL);
466 pipe_resource_reference(&shader->kernel_param, NULL);
467 #endif
468 r600_destroy_shader(&shader->bc);
469 }
470 FREE(shader);
471 }
472
473 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
474 {
475 struct r600_context *rctx = (struct r600_context *)ctx;
476 struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
477 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
478
479 if (!state) {
480 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
481 return;
482 }
483
484 if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
485 bool compute_dirty;
486
487 r600_shader_select(ctx, cstate->sel, &compute_dirty);
488 }
489
490 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
491 }
492
493 /* The kernel parameters are stored in a vtx buffer (ID=0); besides the
494 * explicit kernel parameters, there are implicit parameters that need to be
495 * stored in the vertex buffer as well. Here is how these parameters are organized in
496 * the buffer:
497 *
498 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
499 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
500 * DWORDS 6-8: Number of work items within each work group in each dimension
501 * (x,y,z)
502 * DWORDS 9+ : Kernel parameters
503 */
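/* Worked example with hypothetical numbers: for a launch with
 * info->block = {64,1,1} and info->grid = {4,2,1}, the implicit section of
 * the buffer holds 4,2,1 (work groups), 256,2,1 (global work items, i.e.
 * grid * block per dimension) and 64,1,1 (local size), followed by the
 * caller's kernel parameters. */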
504 static void evergreen_compute_upload_input(struct pipe_context *ctx,
505 const struct pipe_grid_info *info)
506 {
507 struct r600_context *rctx = (struct r600_context *)ctx;
508 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
509 unsigned i;
510 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
511 * parameters.
512 */
513 unsigned input_size;
514 uint32_t *num_work_groups_start;
515 uint32_t *global_size_start;
516 uint32_t *local_size_start;
517 uint32_t *kernel_parameters_start;
518 struct pipe_box box;
519 struct pipe_transfer *transfer = NULL;
520
521 if (!shader)
522 return;
523 if (shader->input_size == 0) {
524 return;
525 }
526 input_size = shader->input_size + 36;
527 if (!shader->kernel_param) {
528 /* Add space for the grid dimensions */
529 shader->kernel_param = (struct r600_resource *)
530 pipe_buffer_create(ctx->screen, 0,
531 PIPE_USAGE_IMMUTABLE, input_size);
532 }
533
534 u_box_1d(0, input_size, &box);
535 num_work_groups_start = ctx->transfer_map(ctx,
536 (struct pipe_resource*)shader->kernel_param,
537 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
538 &box, &transfer);
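/* Carve the mapped buffer into the sections described above: three dwords
 * each for the work group count, the global size and the local size,
 * followed by the kernel parameters. Each step below advances by three
 * uint32_t slots (sizeof(uint) / 4 == 1 here). */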
539 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
540 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
541 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
542
543 /* Copy the number of work groups in each dimension */
544 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
545
546 /* Copy the global size */
547 for (i = 0; i < 3; i++) {
548 global_size_start[i] = info->grid[i] * info->block[i];
549 }
550
551 /* Copy the local dimensions */
552 memcpy(local_size_start, info->block, 3 * sizeof(uint));
553
554 /* Copy the kernel inputs */
555 memcpy(kernel_parameters_start, info->input, shader->input_size);
556
557 for (i = 0; i < (input_size / 4); i++) {
558 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
559 ((unsigned*)num_work_groups_start)[i]);
560 }
561
562 ctx->transfer_unmap(ctx, transfer);
563
564 /* ID=0 and ID=3 are reserved for the parameters.
565 * LLVM will preferably use ID=0, but it does not work for dynamic
566 * indices. */
567 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
568 (struct pipe_resource*)shader->kernel_param);
569 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
570 (struct pipe_resource*)shader->kernel_param);
571 }
572
573 static void evergreen_emit_dispatch(struct r600_context *rctx,
574 const struct pipe_grid_info *info,
575 uint32_t indirect_grid[3])
576 {
577 int i;
578 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
579 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
580 bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
581 unsigned num_waves;
582 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
583 unsigned wave_divisor = (16 * num_pipes);
584 int group_size = 1;
585 int grid_size = 1;
586 unsigned lds_size = shader->local_size / 4;
587
588 if (shader->ir_type != PIPE_SHADER_IR_TGSI)
589 lds_size += shader->bc.nlds_dw;
590
591 /* Calculate group_size/grid_size */
592 for (i = 0; i < 3; i++) {
593 group_size *= info->block[i];
594 }
595
596 for (i = 0; i < 3; i++) {
597 grid_size *= info->grid[i];
598 }
599
600 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
601 num_waves = (info->block[0] * info->block[1] * info->block[2] +
602 wave_divisor - 1) / wave_divisor;
603
604 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
605 "%u wavefronts per thread block, "
606 "allocating %u dwords lds.\n",
607 num_pipes, num_waves, lds_size);
608
609 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
610
611 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
612 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
613 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
614 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
615
616 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
617 group_size);
618
619 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
620 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
621 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
622 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
623
624 if (rctx->b.chip_class < CAYMAN) {
625 assert(lds_size <= 8192);
626 } else {
627 /* Cayman appears to have a slightly smaller limit, see the
628 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
629 assert(lds_size <= 8160);
630 }
631
632 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
633 lds_size | (num_waves << 14));
634
635 if (info->indirect) {
636 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
637 radeon_emit(cs, indirect_grid[0]);
638 radeon_emit(cs, indirect_grid[1]);
639 radeon_emit(cs, indirect_grid[2]);
640 radeon_emit(cs, 1);
641 } else {
642 /* Dispatch packet */
643 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
644 radeon_emit(cs, info->grid[0]);
645 radeon_emit(cs, info->grid[1]);
646 radeon_emit(cs, info->grid[2]);
647 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
648 radeon_emit(cs, 1);
649 }
650
651 if (rctx->is_debug)
652 eg_trace_emit(rctx);
653 }
654
655 static void compute_setup_cbs(struct r600_context *rctx)
656 {
657 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
658 unsigned i;
659
660 /* Emit colorbuffers. */
661 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
662 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
663 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
664 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
665 (struct r600_resource*)cb->base.texture,
666 RADEON_USAGE_READWRITE,
667 RADEON_PRIO_SHADER_RW_BUFFER);
668
669 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
670 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
671 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
672 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
673 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
674 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
675 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
676 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
677
678 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
679 radeon_emit(cs, reloc);
680
681 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
682 radeon_emit(cs, reloc);
683 }
684 for (; i < 8 ; i++)
685 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
686 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
687 for (; i < 12; i++)
688 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
689 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
690
691 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
692 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
693 rctx->compute_cb_target_mask);
694 }
695
696 static void compute_emit_cs(struct r600_context *rctx,
697 const struct pipe_grid_info *info)
698 {
699 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
700 bool compute_dirty = false;
701 struct r600_pipe_shader *current;
702 struct r600_shader_atomic combined_atomics[8];
703 uint8_t atomic_used_mask;
704 uint32_t indirect_grid[3] = { 0, 0, 0 };
705
706 /* make sure that the gfx ring is the only active one */
707 if (radeon_emitted(rctx->b.dma.cs, 0)) {
708 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
709 }
710
711 r600_update_compressed_resource_state(rctx, true);
712
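/* If the current command buffer was recording 3D work, flush it and start
 * a fresh command stream for compute. */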
713 if (!rctx->cmd_buf_is_compute) {
714 rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
715 rctx->cmd_buf_is_compute = true;
716 }
717
718 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
719 r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
720 current = rctx->cs_shader_state.shader->sel->current;
721 if (compute_dirty) {
722 rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
723 r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
724 r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
725 }
726
727 bool need_buf_const = current->shader.uses_tex_buffers ||
728 current->shader.has_txq_cube_array_z_comp;
729
730 if (info->indirect) {
731 struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
732 unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
733 unsigned offset = info->indirect_offset / 4;
734 indirect_grid[0] = data[offset];
735 indirect_grid[1] = data[offset + 1];
736 indirect_grid[2] = data[offset + 2];
737 }
738 for (int i = 0; i < 3; i++) {
739 rctx->cs_block_grid_sizes[i] = info->block[i];
740 rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
741 }
742 rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
743 rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
744
745 evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
746 r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
747
748 if (need_buf_const) {
749 eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
750 }
751 r600_update_driver_const_buffers(rctx, true);
752
753 evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
754 if (atomic_used_mask) {
755 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
756 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
757 }
758 } else
759 r600_need_cs_space(rctx, 0, true, 0);
760
761 /* Initialize all the compute-related registers.
762 *
763 * See evergreen_init_atom_start_compute_cs() in this file for the list
764 * of registers initialized by the start_compute_cs_cmd atom.
765 */
766 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
767
768 /* emit config state */
769 if (rctx->b.chip_class == EVERGREEN) {
770 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
771 radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
772 radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
773 radeon_emit(cs, 0);
774 radeon_emit(cs, 0);
775 radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
776 } else
777 r600_emit_atom(rctx, &rctx->config_state.atom);
778 }
779
780 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
781 r600_flush_emit(rctx);
782
783 if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {
784
785 compute_setup_cbs(rctx);
786
787 /* Emit vertex buffer state */
788 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
789 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
790 } else {
791 uint32_t rat_mask;
792
793 rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
794 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
795 rat_mask);
796 }
797
798 r600_emit_atom(rctx, &rctx->b.render_cond_atom);
799
800 /* Emit constant buffer state */
801 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
802
803 /* Emit sampler state */
804 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
805
806 /* Emit sampler view (texture resource) state */
807 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
808
809 /* Emit images state */
810 r600_emit_atom(rctx, &rctx->compute_images.atom);
811
812 /* Emit buffers state */
813 r600_emit_atom(rctx, &rctx->compute_buffers.atom);
814
815 /* Emit shader state */
816 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
817
818 /* Emit dispatch state and dispatch packet */
819 evergreen_emit_dispatch(rctx, info, indirect_grid);
820
821 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
822 */
823 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
824 R600_CONTEXT_INV_VERTEX_CACHE |
825 R600_CONTEXT_INV_TEX_CACHE;
826 r600_flush_emit(rctx);
827 rctx->b.flags = 0;
828
829 if (rctx->b.chip_class >= CAYMAN) {
830 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
831 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
832 /* DEALLOC_STATE prevents the GPU from hanging when a
833 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
834 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
835 */
836 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
837 radeon_emit(cs, 0);
838 }
839 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
840 evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
841
842 #if 0
843 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
844 for (i = 0; i < cs->cdw; i++) {
845 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
846 }
847 #endif
848
849 }
850
851
852 /**
853 * Emit function for r600_cs_shader_state atom
854 */
855 void evergreen_emit_cs_shader(struct r600_context *rctx,
856 struct r600_atom *atom)
857 {
858 struct r600_cs_shader_state *state =
859 (struct r600_cs_shader_state*)atom;
860 struct r600_pipe_compute *shader = state->shader;
861 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
862 uint64_t va;
863 struct r600_resource *code_bo;
864 unsigned ngpr, nstack;
865
866 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
867 code_bo = shader->sel->current->bo;
868 va = shader->sel->current->bo->gpu_address;
869 ngpr = shader->sel->current->shader.bc.ngpr;
870 nstack = shader->sel->current->shader.bc.nstack;
871 } else {
872 code_bo = shader->code_bo;
873 va = shader->code_bo->gpu_address + state->pc;
874 ngpr = shader->bc.ngpr;
875 nstack = shader->bc.nstack;
876 }
877
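/* The compute program is set up through the LS (local shader) program
 * registers below: SQ_PGM_START_LS, SQ_PGM_RESOURCES_LS and
 * SQ_PGM_RESOURCES_LS_2. */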
878 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
879 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
880 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
881 S_0288D4_NUM_GPRS(ngpr) |
882 S_0288D4_DX10_CLAMP(1) |
883 S_0288D4_STACK_SIZE(nstack));
884 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
885
886 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
887 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
888 code_bo, RADEON_USAGE_READ,
889 RADEON_PRIO_SHADER_BINARY));
890 }
891
892 static void evergreen_launch_grid(struct pipe_context *ctx,
893 const struct pipe_grid_info *info)
894 {
895 struct r600_context *rctx = (struct r600_context *)ctx;
896 #ifdef HAVE_OPENCL
897 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
898 boolean use_kill;
899
900 if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
901 rctx->cs_shader_state.pc = info->pc;
902 /* Get the config information for this kernel. */
903 r600_shader_binary_read_config(&shader->binary, &shader->bc,
904 info->pc, &use_kill);
905 } else {
906 use_kill = false;
907 rctx->cs_shader_state.pc = 0;
908 }
909 #endif
910
911 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
912
913
914 evergreen_compute_upload_input(ctx, info);
915 compute_emit_cs(rctx, info);
916 }
917
918 static void evergreen_set_compute_resources(struct pipe_context *ctx,
919 unsigned start, unsigned count,
920 struct pipe_surface **surfaces)
921 {
922 struct r600_context *rctx = (struct r600_context *)ctx;
923 struct r600_surface **resources = (struct r600_surface **)surfaces;
924
925 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
926 start, count);
927
928 for (unsigned i = 0; i < count; i++) {
929 /* The first four vertex buffers are reserved for parameters and
930 * global buffers. */
931 unsigned vtx_id = 4 + i;
932 if (resources[i]) {
933 struct r600_resource_global *buffer =
934 (struct r600_resource_global*)
935 resources[i]->base.texture;
936 if (resources[i]->base.writable) {
937 assert(i+1 < 12);
938
939 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
940 (struct r600_resource *)resources[i]->base.texture,
941 buffer->chunk->start_in_dw*4,
942 resources[i]->base.texture->width0);
943 }
944
945 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
946 buffer->chunk->start_in_dw * 4,
947 resources[i]->base.texture);
948 }
949 }
950 }
951
952 static void evergreen_set_global_binding(struct pipe_context *ctx,
953 unsigned first, unsigned n,
954 struct pipe_resource **resources,
955 uint32_t **handles)
956 {
957 struct r600_context *rctx = (struct r600_context *)ctx;
958 struct compute_memory_pool *pool = rctx->screen->global_pool;
959 struct r600_resource_global **buffers =
960 (struct r600_resource_global **)resources;
961 unsigned i;
962
963 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
964 first, n);
965
966 if (!resources) {
967 /* XXX: Unset */
968 return;
969 }
970
971 /* We mark these items for promotion to the pool if they
972 * aren't already there */
973 for (i = first; i < first + n; i++) {
974 struct compute_memory_item *item = buffers[i]->chunk;
975
976 if (!is_item_in_pool(item))
977 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
978 }
979
980 if (compute_memory_finalize_pending(pool, ctx) == -1) {
981 /* XXX: Unset */
982 return;
983 }
984
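/* Patch each kernel handle: the value passed in is a byte offset relative
 * to the buffer, so add the buffer's start offset within the pool (in
 * bytes) and store the result back in little-endian form. */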
985 for (i = first; i < first + n; i++)
986 {
987 uint32_t buffer_offset;
988 uint32_t handle;
989 assert(resources[i]->target == PIPE_BUFFER);
990 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
991
992 buffer_offset = util_le32_to_cpu(*(handles[i]));
993 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
994
995 *(handles[i]) = util_cpu_to_le32(handle);
996 }
997
998 /* globals for writing */
999 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1000 /* globals for reading */
1001 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1002 (struct pipe_resource*)pool->bo);
1003
1004 /* constants for reading; LLVM puts them in the text segment */
1005 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1006 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1007 }
1008
1009 /**
1010 * This function initializes all the compute specific registers that need to
1011 * be initialized for each compute command stream. Registers that are common
1012 * to both compute and 3D will be initialized at the beginning of each compute
1013 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
1014 * packet requires that the shader type bit be set, we must initialize all
1015 * context registers needed for compute in this function. The registers
1016 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1017 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1018 * on the GPU family.
1019 */
1020 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1021 {
1022 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1023 int num_threads;
1024 int num_stack_entries;
1025
1026 /* since all required registers are initialized in the
1027 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1028 */
1029 r600_init_command_buffer(cb, 256);
1030 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1031
1032 /* We're setting config registers here. */
1033 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1034 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1035
1036 switch (rctx->b.family) {
1037 case CHIP_CEDAR:
1038 default:
1039 num_threads = 128;
1040 num_stack_entries = 256;
1041 break;
1042 case CHIP_REDWOOD:
1043 num_threads = 128;
1044 num_stack_entries = 256;
1045 break;
1046 case CHIP_JUNIPER:
1047 num_threads = 128;
1048 num_stack_entries = 512;
1049 break;
1050 case CHIP_CYPRESS:
1051 case CHIP_HEMLOCK:
1052 num_threads = 128;
1053 num_stack_entries = 512;
1054 break;
1055 case CHIP_PALM:
1056 num_threads = 128;
1057 num_stack_entries = 256;
1058 break;
1059 case CHIP_SUMO:
1060 num_threads = 128;
1061 num_stack_entries = 256;
1062 break;
1063 case CHIP_SUMO2:
1064 num_threads = 128;
1065 num_stack_entries = 512;
1066 break;
1067 case CHIP_BARTS:
1068 num_threads = 128;
1069 num_stack_entries = 512;
1070 break;
1071 case CHIP_TURKS:
1072 num_threads = 128;
1073 num_stack_entries = 256;
1074 break;
1075 case CHIP_CAICOS:
1076 num_threads = 128;
1077 num_stack_entries = 256;
1078 break;
1079 }
1080
1081 /* The primitive type always needs to be POINTLIST for compute. */
1082 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1083 V_008958_DI_PT_POINTLIST);
1084
1085 if (rctx->b.chip_class < CAYMAN) {
1086
1087 /* These registers control which simds can be used by each stage.
1088 * The default for these registers is 0xffffffff, which means
1089 * all simds are available for each stage. It's possible we may
1090 * want to play around with these in the future, but for now
1091 * the default value is fine.
1092 *
1093 * R_008E20_SQ_STATIC_THREAD_MGMT1
1094 * R_008E24_SQ_STATIC_THREAD_MGMT2
1095 * R_008E28_SQ_STATIC_THREAD_MGMT3
1096 */
1097
1098 /* XXX: We may need to adjust the thread and stack resource
1099 * values for 3D/compute interop */
1100
1101 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1102
1103 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1104 * Set the number of threads used by the PS/VS/GS/ES stage to
1105 * 0.
1106 */
1107 r600_store_value(cb, 0);
1108
1109 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1110 * Set the number of threads used by the CS (aka LS) stage to
1111 * the maximum number of threads and set the number of threads
1112 * for the HS stage to 0. */
1113 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1114
1115 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1116 * Set the Control Flow stack entries to 0 for PS/VS stages */
1117 r600_store_value(cb, 0);
1118
1119 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1120 * Set the Control Flow stack entries to 0 for GS/ES stages */
1121 r600_store_value(cb, 0);
1122
1123 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1124 * Set the Control Flow stack entries to 0 for the HS stage, and
1125 * set it to the maximum value for the CS (aka LS) stage. */
1126 r600_store_value(cb,
1127 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1128 }
1129 /* Give the compute shader all the available LDS space.
1130 * NOTE: This only sets the maximum number of dwords that a compute
1131 * shader can allocate. When a shader is executed, we still need to
1132 * allocate the appropriate amount of LDS dwords using the
1133 * CM_R_0288E8_SQ_LDS_ALLOC register.
1134 */
1135 if (rctx->b.chip_class < CAYMAN) {
1136 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1137 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1138 } else {
1139 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1140 S_0286FC_NUM_PS_LDS(0) |
1141 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1142 }
1143
1144 /* Context Registers */
1145
1146 if (rctx->b.chip_class < CAYMAN) {
1147 /* workaround for hw issues with dyn gpr - must set all limits
1148 * to 240 instead of 0, 0x1e == 240 / 8
1149 */
1150 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1151 S_028838_PS_GPRS(0x1e) |
1152 S_028838_VS_GPRS(0x1e) |
1153 S_028838_GS_GPRS(0x1e) |
1154 S_028838_ES_GPRS(0x1e) |
1155 S_028838_HS_GPRS(0x1e) |
1156 S_028838_LS_GPRS(0x1e));
1157 }
1158
1159 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1160 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1161 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1162
1163 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1164
1165 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1166 S_0286E8_TID_IN_GROUP_ENA(1) |
1167 S_0286E8_TGID_ENA(1) |
1168 S_0286E8_DISABLE_INDEX_PACK(1));
1169
1170 /* The LOOP_CONST registers are an optimization for loops that allows
1171 * you to store the initial counter, increment value, and maximum
1172 * counter value in a register so that hardware can calculate the
1173 * correct number of iterations for the loop, so that you don't need
1174 * to keep the loop counter in your shader code. We don't currently use
1175 * this optimization, so we must keep track of the counter in the
1176 * shader and use a break instruction to exit loops. However, the
1177 * hardware will still use this register to determine when to exit a
1178 * loop, so we need to initialize the counter to 0, set the increment
1179 * value to 1 and the maximum counter value to 4095 (0xfff), which
1180 * is the maximum value allowed. This gives us a maximum of 4096
1181 * iterations for our loops, but hopefully our break instruction will
1182 * execute some time before the 4096th iteration.
1183 */
1184 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1185 }
1186
1187 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1188 {
1189 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1190 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1191 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1192 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1193 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1194 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1195 rctx->b.b.launch_grid = evergreen_launch_grid;
1196
1197 }
1198
1199 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1200 struct pipe_resource *resource,
1201 unsigned level,
1202 unsigned usage,
1203 const struct pipe_box *box,
1204 struct pipe_transfer **ptransfer)
1205 {
1206 struct r600_context *rctx = (struct r600_context*)ctx;
1207 struct compute_memory_pool *pool = rctx->screen->global_pool;
1208 struct r600_resource_global* buffer =
1209 (struct r600_resource_global*)resource;
1210
1211 struct compute_memory_item *item = buffer->chunk;
1212 struct pipe_resource *dst = NULL;
1213 unsigned offset = box->x;
1214
1215 if (is_item_in_pool(item)) {
1216 compute_memory_demote_item(pool, item, ctx);
1217 }
1218 else {
1219 if (item->real_buffer == NULL) {
1220 item->real_buffer =
1221 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1222 }
1223 }
1224
1225 dst = (struct pipe_resource*)item->real_buffer;
1226
1227 if (usage & PIPE_TRANSFER_READ)
1228 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1229
1230 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1231 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1232 "width = %u, height = %u, depth = %u)\n", level, usage,
1233 box->x, box->y, box->z, box->width, box->height,
1234 box->depth);
1235 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1236 "%u (box.x)\n", item->id, box->x);
1237
1238
1239 assert(resource->target == PIPE_BUFFER);
1240 assert(resource->bind & PIPE_BIND_GLOBAL);
1241 assert(box->x >= 0);
1242 assert(box->y == 0);
1243 assert(box->z == 0);
1244
1245 ///TODO: do it better, mapping is not possible if the pool is too big
1246 return pipe_buffer_map_range(ctx, dst,
1247 offset, box->width, usage, ptransfer);
1248 }
1249
1250 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1251 struct pipe_transfer *transfer)
1252 {
1253 /* struct r600_resource_global are not real resources, they just map
1254 * to an offset within the compute memory pool. The function
1255 * r600_compute_global_transfer_map() maps the memory pool
1256 * resource rather than the struct r600_resource_global passed to
1257 * it as an argument and then initializes ptransfer->resource with
1258 * the memory pool resource (via pipe_buffer_map_range).
1259 * When transfer_unmap is called, it uses the memory pool's
1260 * vtable, which calls r600_buffer_transfer_unmap() rather than
1261 * this function.
1262 */
1263 assert (!"This function should not be called");
1264 }
1265
1266 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1267 struct pipe_transfer *transfer,
1268 const struct pipe_box *box)
1269 {
1270 assert(0 && "TODO");
1271 }
1272
1273 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1274 struct pipe_resource *res)
1275 {
1276 struct r600_resource_global* buffer = NULL;
1277 struct r600_screen* rscreen = NULL;
1278
1279 assert(res->target == PIPE_BUFFER);
1280 assert(res->bind & PIPE_BIND_GLOBAL);
1281
1282 buffer = (struct r600_resource_global*)res;
1283 rscreen = (struct r600_screen*)screen;
1284
1285 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1286
1287 buffer->chunk = NULL;
1288 free(res);
1289 }
1290
1291 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1292 {
1293 u_default_resource_get_handle, /* get_handle */
1294 r600_compute_global_buffer_destroy, /* resource_destroy */
1295 r600_compute_global_transfer_map, /* transfer_map */
1296 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1297 r600_compute_global_transfer_unmap, /* transfer_unmap */
1298 };
1299
1300 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1301 const struct pipe_resource *templ)
1302 {
1303 struct r600_resource_global* result = NULL;
1304 struct r600_screen* rscreen = NULL;
1305 int size_in_dw = 0;
1306
1307 assert(templ->target == PIPE_BUFFER);
1308 assert(templ->bind & PIPE_BIND_GLOBAL);
1309 assert(templ->array_size == 1 || templ->array_size == 0);
1310 assert(templ->depth0 == 1 || templ->depth0 == 0);
1311 assert(templ->height0 == 1 || templ->height0 == 0);
1312
1313 result = (struct r600_resource_global*)
1314 CALLOC(sizeof(struct r600_resource_global), 1);
1315 rscreen = (struct r600_screen*)screen;
1316
1317 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1318 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1319 templ->array_size);
1320
1321 result->base.b.vtbl = &r600_global_buffer_vtbl;
1322 result->base.b.b = *templ;
1323 result->base.b.b.screen = screen;
1324 pipe_reference_init(&result->base.b.b.reference, 1);
1325
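/* The compute memory pool allocates in dword units, so round the requested
 * byte size up to a whole number of dwords. */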
1326 size_in_dw = (templ->width0+3) / 4;
1327
1328 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1329
1330 if (result->chunk == NULL)
1331 {
1332 free(result);
1333 return NULL;
1334 }
1335
1336 return &result->base.b.b;
1337 }