amd/common: move ac_shader_{binary,reloc} into r600 and rename
mesa.git: src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "pipebuffer/pb_buffer.h"
46 #include "evergreend.h"
47 #include "r600_shader.h"
48 #include "r600_pipe.h"
49 #include "r600_formats.h"
50 #include "evergreen_compute.h"
51 #include "evergreen_compute_internal.h"
52 #include "compute_memory_pool.h"
53 #include "sb/sb_public.h"
54 #include <inttypes.h>
55
56 /**
57 RAT0 is for global binding write
58 VTX1 is for global binding read
59
60 for writing images RAT1...
61 for reading images TEX2...
62 TEX2-RAT1 is paired
63
64 TEX2... consumes the same fetch resources that VTX2... would consume
65
66 CONST0 and VTX0 are for parameters
67 CONST0 binds the smaller input parameter buffer, and is used for constant
68 indexing; it is also constant cached
69 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
70 the constant cache can handle
71
72 RATs are limited to 12, so we can bind at most 11 textures for writing,
73 because we reserve RAT0 for global bindings. With byte addressing enabled,
74 we should reserve another one too => 10 image bindings for writing max.
75
76 from Nvidia OpenCL:
77 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
78 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
79
80 so 10 for writing is enough. 176 is the max for reading according to the docs
81
82 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
83 writable images will consume TEX slots, and VTX slots too, because of linear indexing
84
85 */
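/* A minimal sketch (illustrative only, hypothetical helper names) of the id
 * mapping described above, matching the numbering used later in
 * evergreen_set_compute_resources() and evergreen_set_global_binding():
 *
 *   static unsigned writable_image_to_rat(unsigned id)   { return id + 1; } // RAT0 = global pool
 *   static unsigned readable_image_to_tex(unsigned id)   { return id + 2; } // TEX2... pairs with RAT1...
 *   static unsigned compute_resource_to_vtx(unsigned id) { return id + 4; } // VTX0-3 are reserved
 */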
86
87 MAYBE_UNUSED
88 static void radeon_shader_binary_init(struct r600_shader_binary *b)
89 {
90 memset(b, 0, sizeof(*b));
91 }
92
93 MAYBE_UNUSED
94 static void radeon_shader_binary_clean(struct r600_shader_binary *b)
95 {
96 if (!b)
97 return;
98 FREE(b->code);
99 FREE(b->config);
100 FREE(b->rodata);
101 FREE(b->global_symbol_offsets);
102 FREE(b->relocs);
103 FREE(b->disasm_string);
104 }
105
106 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
107 unsigned size)
108 {
109 struct pipe_resource *buffer = NULL;
110 assert(size);
111
112 buffer = pipe_buffer_create((struct pipe_screen*) screen,
113 0, PIPE_USAGE_IMMUTABLE, size);
114
115 return (struct r600_resource *)buffer;
116 }
117
118
119 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
120 unsigned id,
121 struct r600_resource *bo,
122 int start,
123 int size)
124 {
125 struct pipe_surface rat_templ;
126 struct r600_surface *surf = NULL;
127 struct r600_context *rctx = NULL;
128
129 assert(id < 12);
130 assert((size & 3) == 0);
131 assert((start & 0xFF) == 0);
132
133 rctx = pipe->ctx;
134
135 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
136
137 /* Create the RAT surface */
138 memset(&rat_templ, 0, sizeof(rat_templ));
139 rat_templ.format = PIPE_FORMAT_R32_UINT;
140 rat_templ.u.tex.level = 0;
141 rat_templ.u.tex.first_layer = 0;
142 rat_templ.u.tex.last_layer = 0;
143
144 /* Add the RAT to the list of color buffers. Drop the old buffer first. */
145 pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
146 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
147 (struct pipe_context *)pipe->ctx,
148 (struct pipe_resource *)bo, &rat_templ);
149
150 /* Update the number of color buffers */
151 pipe->ctx->framebuffer.state.nr_cbufs =
152 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
153
154 /* Update the cb_target_mask
155 * XXX: I think this is a potential spot for bugs once we start doing
156 * GL interop. cb_target_mask may be modified in the 3D sections
157 * of this driver. */
158 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
159
160 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
161 evergreen_init_color_surface_rat(rctx, surf);
162 }
163
164 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
165 unsigned vb_index,
166 unsigned offset,
167 struct pipe_resource *buffer)
168 {
169 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
170 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
171 vb->stride = 1;
172 vb->buffer_offset = offset;
173 vb->buffer.resource = buffer;
174 vb->is_user_buffer = false;
175
176 /* The vertex instructions in the compute shaders use the texture cache,
177 * so we need to invalidate it. */
178 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
179 state->enabled_mask |= 1 << vb_index;
180 state->dirty_mask |= 1 << vb_index;
181 r600_mark_atom_dirty(rctx, &state->atom);
182 }
183
184 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
185 unsigned cb_index,
186 unsigned offset,
187 unsigned size,
188 struct pipe_resource *buffer)
189 {
190 struct pipe_constant_buffer cb;
191 cb.buffer_size = size;
192 cb.buffer_offset = offset;
193 cb.buffer = buffer;
194 cb.user_buffer = NULL;
195
196 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
197 }
198
199 /* We need to define these R600 registers here, because we can't include
200 * both evergreend.h and r600d.h.
201 */
202 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
203 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
204
205 #ifdef HAVE_OPENCL
206 static void parse_symbol_table(Elf_Data *symbol_table_data,
207 const GElf_Shdr *symbol_table_header,
208 struct r600_shader_binary *binary)
209 {
210 GElf_Sym symbol;
211 unsigned i = 0;
212 unsigned symbol_count =
213 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
214
215 /* We are over-allocating this list, because symbol_count gives the
216 * total number of symbols, and we will only be filling the list
217 * with offsets of global symbols. The memory savings from
218 * allocating the correct size of this list will be small, and
219 * I don't think it is worth the cost of pre-computing the number
220 * of global symbols.
221 */
222 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
223
224 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
225 unsigned i;
226 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
227 symbol.st_shndx == 0 /* Undefined symbol */) {
228 continue;
229 }
230
231 binary->global_symbol_offsets[binary->global_symbol_count] =
232 symbol.st_value;
233
234 /* Sort the list using bubble sort. This list will usually
235 * be small. */
236 for (i = binary->global_symbol_count; i > 0; --i) {
237 uint64_t lhs = binary->global_symbol_offsets[i - 1];
238 uint64_t rhs = binary->global_symbol_offsets[i];
239 if (lhs < rhs) {
240 break;
241 }
242 binary->global_symbol_offsets[i] = lhs;
243 binary->global_symbol_offsets[i - 1] = rhs;
244 }
245 ++binary->global_symbol_count;
246 }
247 }
248
249
250 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
251 unsigned symbol_sh_link,
252 struct r600_shader_binary *binary)
253 {
254 unsigned i;
255
256 if (!relocs || !symbols || !binary->reloc_count) {
257 return;
258 }
259 binary->relocs = CALLOC(binary->reloc_count,
260 sizeof(struct r600_shader_reloc));
261 for (i = 0; i < binary->reloc_count; i++) {
262 GElf_Sym symbol;
263 GElf_Rel rel;
264 char *symbol_name;
265 struct r600_shader_reloc *reloc = &binary->relocs[i];
266
267 gelf_getrel(relocs, i, &rel);
268 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
269 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
270
271 reloc->offset = rel.r_offset;
272 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
273 reloc->name[sizeof(reloc->name)-1] = 0;
274 }
275 }
276
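/* Parse a kernel ELF produced by the LLVM back end: copy .text into
 * binary->code, .AMDGPU.config into binary->config and .rodata into
 * binary->rodata, keep the disassembly from .AMDGPU.disasm when present,
 * collect global symbol offsets from .symtab and relocations from .rel.text,
 * and finally derive config_size_per_symbol. */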
277 static void r600_elf_read(const char *elf_data, unsigned elf_size,
278 struct r600_shader_binary *binary)
279 {
280 char *elf_buffer;
281 Elf *elf;
282 Elf_Scn *section = NULL;
283 Elf_Data *symbols = NULL, *relocs = NULL;
284 size_t section_str_index;
285 unsigned symbol_sh_link = 0;
286
287 /* One of the libelf implementations
288 * (http://www.mr511.de/software/english.htm) requires calling
289 * elf_version() before elf_memory().
290 */
291 elf_version(EV_CURRENT);
292 elf_buffer = MALLOC(elf_size);
293 memcpy(elf_buffer, elf_data, elf_size);
294
295 elf = elf_memory(elf_buffer, elf_size);
296
297 elf_getshdrstrndx(elf, &section_str_index);
298
299 while ((section = elf_nextscn(elf, section))) {
300 const char *name;
301 Elf_Data *section_data = NULL;
302 GElf_Shdr section_header;
303 if (gelf_getshdr(section, &section_header) != &section_header) {
304 fprintf(stderr, "Failed to read ELF section header\n");
305 return;
306 }
307 name = elf_strptr(elf, section_str_index, section_header.sh_name);
308 if (!strcmp(name, ".text")) {
309 section_data = elf_getdata(section, section_data);
310 binary->code_size = section_data->d_size;
311 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
312 memcpy(binary->code, section_data->d_buf, binary->code_size);
313 } else if (!strcmp(name, ".AMDGPU.config")) {
314 section_data = elf_getdata(section, section_data);
315 binary->config_size = section_data->d_size;
316 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
317 memcpy(binary->config, section_data->d_buf, binary->config_size);
318 } else if (!strcmp(name, ".AMDGPU.disasm")) {
319 /* Always read disassembly if it's available. */
320 section_data = elf_getdata(section, section_data);
321 binary->disasm_string = strndup(section_data->d_buf,
322 section_data->d_size);
323 } else if (!strncmp(name, ".rodata", 7)) {
324 section_data = elf_getdata(section, section_data);
325 binary->rodata_size = section_data->d_size;
326 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
327 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
328 } else if (!strncmp(name, ".symtab", 7)) {
329 symbols = elf_getdata(section, section_data);
330 symbol_sh_link = section_header.sh_link;
331 parse_symbol_table(symbols, &section_header, binary);
332 } else if (!strcmp(name, ".rel.text")) {
333 relocs = elf_getdata(section, section_data);
334 binary->reloc_count = section_header.sh_size /
335 section_header.sh_entsize;
336 }
337 }
338
339 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
340
341 if (elf){
342 elf_end(elf);
343 }
344 FREE(elf_buffer);
345
346 /* Cache the config size per symbol */
347 if (binary->global_symbol_count) {
348 binary->config_size_per_symbol =
349 binary->config_size / binary->global_symbol_count;
350 } else {
351 binary->global_symbol_count = 1;
352 binary->config_size_per_symbol = binary->config_size;
353 }
354 }
355
356 static const unsigned char *r600_shader_binary_config_start(
357 const struct r600_shader_binary *binary,
358 uint64_t symbol_offset)
359 {
360 unsigned i;
361 for (i = 0; i < binary->global_symbol_count; ++i) {
362 if (binary->global_symbol_offsets[i] == symbol_offset) {
363 unsigned offset = i * binary->config_size_per_symbol;
364 return binary->config + offset;
365 }
366 }
367 return binary->config;
368 }
369
370 static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
371 struct r600_bytecode *bc,
372 uint64_t symbol_offset,
373 boolean *use_kill)
374 {
375 unsigned i;
376 const unsigned char *config =
377 r600_shader_binary_config_start(binary, symbol_offset);
378
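/* The config blob is a flat array of (register, value) pairs of little-endian
 * dwords, config_size_per_symbol bytes per kernel entry point. */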
379 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
380 unsigned reg =
381 util_le32_to_cpu(*(uint32_t*)(config + i));
382 unsigned value =
383 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
384 switch (reg) {
385 /* R600 / R700 */
386 case R_028850_SQ_PGM_RESOURCES_PS:
387 case R_028868_SQ_PGM_RESOURCES_VS:
388 /* Evergreen / Northern Islands */
389 case R_028844_SQ_PGM_RESOURCES_PS:
390 case R_028860_SQ_PGM_RESOURCES_VS:
391 case R_0288D4_SQ_PGM_RESOURCES_LS:
392 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
393 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
394 break;
395 case R_02880C_DB_SHADER_CONTROL:
396 *use_kill = G_02880C_KILL_ENABLE(value);
397 break;
398 case R_0288E8_SQ_LDS_ALLOC:
399 bc->nlds_dw = value;
400 break;
401 }
402 }
403 }
404
405 static unsigned r600_create_shader(struct r600_bytecode *bc,
406 const struct r600_shader_binary *binary,
407 boolean *use_kill)
408
409 {
410 assert(binary->code_size % 4 == 0);
411 bc->bytecode = CALLOC(1, binary->code_size);
412 memcpy(bc->bytecode, binary->code, binary->code_size);
413 bc->ndw = binary->code_size / 4;
414
415 r600_shader_binary_read_config(binary, bc, 0, use_kill);
416 return 0;
417 }
418
419 #endif
420
421 static void r600_destroy_shader(struct r600_bytecode *bc)
422 {
423 FREE(bc->bytecode);
424 }
425
426 static void *evergreen_create_compute_state(struct pipe_context *ctx,
427 const struct pipe_compute_state *cso)
428 {
429 struct r600_context *rctx = (struct r600_context *)ctx;
430 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
431 #ifdef HAVE_OPENCL
432 const struct pipe_llvm_program_header *header;
433 const char *code;
434 void *p;
435 boolean use_kill;
436 #endif
437
438 shader->ctx = rctx;
439 shader->local_size = cso->req_local_mem;
440 shader->private_size = cso->req_private_mem;
441 shader->input_size = cso->req_input_mem;
442
443 shader->ir_type = cso->ir_type;
444
445 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
446 shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
447 return shader;
448 }
449 #ifdef HAVE_OPENCL
450 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
451 header = cso->prog;
452 code = cso->prog + sizeof(struct pipe_llvm_program_header);
453 radeon_shader_binary_init(&shader->binary);
454 r600_elf_read(code, header->num_bytes, &shader->binary);
455 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
456
457 /* Upload code + ROdata */
458 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
459 shader->bc.ndw * 4);
460 p = r600_buffer_map_sync_with_rings(
461 &rctx->b, shader->code_bo,
462 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
463 //TODO: use util_memcpy_cpu_to_le32 ?
464 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
465 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
466 #endif
467
468 return shader;
469 }
470
471 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
472 {
473 struct r600_context *rctx = (struct r600_context *)ctx;
474 struct r600_pipe_compute *shader = state;
475
476 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
477
478 if (!shader)
479 return;
480
481 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
482 r600_delete_shader_selector(ctx, shader->sel);
483 } else {
484 #ifdef HAVE_OPENCL
485 radeon_shader_binary_clean(&shader->binary);
486 pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
487 pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
488 #endif
489 r600_destroy_shader(&shader->bc);
490 }
491 FREE(shader);
492 }
493
494 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
495 {
496 struct r600_context *rctx = (struct r600_context *)ctx;
497 struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
498 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
499
500 if (!state) {
501 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
502 return;
503 }
504
505 if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
506 bool compute_dirty;
507
508 r600_shader_select(ctx, cstate->sel, &compute_dirty);
509 }
510
511 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
512 }
513
514 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
515 * kernel parameters, there are implicit parameters that need to be stored
516 * in the vertex buffer as well. Here is how these parameters are organized in
517 * the buffer (see also the layout sketch after this function):
518 *
519 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
520 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
521 * DWORDS 6-8: Number of work items within each work group in each dimension
522 * (x,y,z)
523 * DWORDS 9+ : Kernel parameters
524 */
525 static void evergreen_compute_upload_input(struct pipe_context *ctx,
526 const struct pipe_grid_info *info)
527 {
528 struct r600_context *rctx = (struct r600_context *)ctx;
529 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
530 unsigned i;
531 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
532 * parameters.
533 */
534 unsigned input_size;
535 uint32_t *num_work_groups_start;
536 uint32_t *global_size_start;
537 uint32_t *local_size_start;
538 uint32_t *kernel_parameters_start;
539 struct pipe_box box;
540 struct pipe_transfer *transfer = NULL;
541
542 if (!shader)
543 return;
544 if (shader->input_size == 0) {
545 return;
546 }
547 input_size = shader->input_size + 36;
548 if (!shader->kernel_param) {
549 /* Add space for the grid dimensions */
550 shader->kernel_param = (struct r600_resource *)
551 pipe_buffer_create(ctx->screen, 0,
552 PIPE_USAGE_IMMUTABLE, input_size);
553 }
554
555 u_box_1d(0, input_size, &box);
556 num_work_groups_start = ctx->transfer_map(ctx,
557 (struct pipe_resource*)shader->kernel_param,
558 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
559 &box, &transfer);
560 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
561 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
562 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
563
564 /* Copy the grid size (number of work groups in each dimension) */
565 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
566
567 /* Copy the global size */
568 for (i = 0; i < 3; i++) {
569 global_size_start[i] = info->grid[i] * info->block[i];
570 }
571
572 /* Copy the local dimensions */
573 memcpy(local_size_start, info->block, 3 * sizeof(uint));
574
575 /* Copy the kernel inputs */
576 memcpy(kernel_parameters_start, info->input, shader->input_size);
577
578 for (i = 0; i < (input_size / 4); i++) {
579 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
580 ((unsigned*)num_work_groups_start)[i]);
581 }
582
583 ctx->transfer_unmap(ctx, transfer);
584
585 /* ID=0 and ID=3 are reserved for the parameters.
586 * LLVM will preferably use ID=0, but it does not work for dynamic
587 * indices. */
588 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
589 (struct pipe_resource*)shader->kernel_param);
590 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
591 (struct pipe_resource*)shader->kernel_param);
592 }
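/* A sketch of the buffer layout written above, in dwords (illustrative,
 * assuming a kernel with two 4-byte arguments):
 *
 *   dw[0..2]  = info->grid[0..2];                  // number of work groups
 *   dw[3..5]  = info->grid[i] * info->block[i];    // global work size
 *   dw[6..8]  = info->block[0..2];                 // local work-group size
 *   dw[9..10] = the two kernel arguments           // shader->input_size bytes
 */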
593
594 static void evergreen_emit_dispatch(struct r600_context *rctx,
595 const struct pipe_grid_info *info,
596 uint32_t indirect_grid[3])
597 {
598 int i;
599 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
600 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
601 bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
602 unsigned num_waves;
603 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
604 unsigned wave_divisor = (16 * num_pipes);
605 int group_size = 1;
606 int grid_size = 1;
607 unsigned lds_size = shader->local_size / 4;
608
609 if (shader->ir_type != PIPE_SHADER_IR_TGSI)
610 lds_size += shader->bc.nlds_dw;
611
612 /* Calculate group_size/grid_size */
613 for (i = 0; i < 3; i++) {
614 group_size *= info->block[i];
615 }
616
617 for (i = 0; i < 3; i++) {
618 grid_size *= info->grid[i];
619 }
620
621 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
622 num_waves = (info->block[0] * info->block[1] * info->block[2] +
623 wave_divisor - 1) / wave_divisor;
624
625 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
626 "%u wavefronts per thread block, "
627 "allocating %u dwords lds.\n",
628 num_pipes, num_waves, lds_size);
629
630 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
631
632 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
633 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
634 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
635 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
636
637 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
638 group_size);
639
640 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
641 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
642 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
643 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
644
645 if (rctx->b.chip_class < CAYMAN) {
646 assert(lds_size <= 8192);
647 } else {
648 /* Cayman appears to have a slightly smaller limit, see the
649 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
650 assert(lds_size <= 8160);
651 }
652
653 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
654 lds_size | (num_waves << 14));
655
656 if (info->indirect) {
657 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
658 radeon_emit(cs, indirect_grid[0]);
659 radeon_emit(cs, indirect_grid[1]);
660 radeon_emit(cs, indirect_grid[2]);
661 radeon_emit(cs, 1);
662 } else {
663 /* Dispatch packet */
664 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
665 radeon_emit(cs, info->grid[0]);
666 radeon_emit(cs, info->grid[1]);
667 radeon_emit(cs, info->grid[2]);
668 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
669 radeon_emit(cs, 1);
670 }
671
672 if (rctx->is_debug)
673 eg_trace_emit(rctx);
674 }
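/* Worked example for the wave count above (illustrative numbers): with
 * info->block = (64, 4, 1) and r600_max_quad_pipes = 8, wave_divisor is
 * 16 * 8 = 128, so num_waves = ceil(256 / 128) = 2 wavefronts per group. */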
675
676 static void compute_setup_cbs(struct r600_context *rctx)
677 {
678 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
679 unsigned i;
680
681 /* Emit colorbuffers. */
682 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
683 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
684 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
685 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
686 (struct r600_resource*)cb->base.texture,
687 RADEON_USAGE_READWRITE,
688 RADEON_PRIO_SHADER_RW_BUFFER);
689
690 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
691 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
692 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
693 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
694 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
695 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
696 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
697 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
698
699 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
700 radeon_emit(cs, reloc);
701
702 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
703 radeon_emit(cs, reloc);
704 }
705 for (; i < 8 ; i++)
706 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
707 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
708 for (; i < 12; i++)
709 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
710 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
711
712 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
713 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
714 rctx->compute_cb_target_mask);
715 }
716
717 static void compute_emit_cs(struct r600_context *rctx,
718 const struct pipe_grid_info *info)
719 {
720 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
721 bool compute_dirty = false;
722 struct r600_pipe_shader *current;
723 struct r600_shader_atomic combined_atomics[8];
724 uint8_t atomic_used_mask;
725 uint32_t indirect_grid[3] = { 0, 0, 0 };
726
727 /* make sure that the gfx ring is the only one active */
728 if (radeon_emitted(rctx->b.dma.cs, 0)) {
729 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
730 }
731
732 r600_update_compressed_resource_state(rctx, true);
733
734 if (!rctx->cmd_buf_is_compute) {
735 rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
736 rctx->cmd_buf_is_compute = true;
737 }
738
739 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
740 r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
741 current = rctx->cs_shader_state.shader->sel->current;
742 if (compute_dirty) {
743 rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
744 r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
745 r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
746 }
747
748 bool need_buf_const = current->shader.uses_tex_buffers ||
749 current->shader.has_txq_cube_array_z_comp;
750
751 if (info->indirect) {
752 struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
753 unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
754 unsigned offset = info->indirect_offset / 4;
755 indirect_grid[0] = data[offset];
756 indirect_grid[1] = data[offset + 1];
757 indirect_grid[2] = data[offset + 2];
758 }
759 for (int i = 0; i < 3; i++) {
760 rctx->cs_block_grid_sizes[i] = info->block[i];
761 rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
762 }
763 rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
764 rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
765
766 evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
767 r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
768
769 if (need_buf_const) {
770 eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
771 }
772 r600_update_driver_const_buffers(rctx, true);
773
774 evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
775 if (atomic_used_mask) {
776 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
777 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
778 }
779 } else
780 r600_need_cs_space(rctx, 0, true, 0);
781
782 /* Initialize all the compute-related registers.
783 *
784 * See evergreen_init_atom_start_compute_cs() in this file for the list
785 * of registers initialized by the start_compute_cs_cmd atom.
786 */
787 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
788
789 /* emit config state */
790 if (rctx->b.chip_class == EVERGREEN) {
791 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
792 radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
793 radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
794 radeon_emit(cs, 0);
795 radeon_emit(cs, 0);
796 radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
797 } else
798 r600_emit_atom(rctx, &rctx->config_state.atom);
799 }
800
801 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
802 r600_flush_emit(rctx);
803
804 if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {
805
806 compute_setup_cbs(rctx);
807
808 /* Emit vertex buffer state */
809 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
810 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
811 } else {
812 uint32_t rat_mask;
813
814 rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
815 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
816 rat_mask);
817 }
818
819 r600_emit_atom(rctx, &rctx->b.render_cond_atom);
820
821 /* Emit constant buffer state */
822 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
823
824 /* Emit sampler state */
825 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
826
827 /* Emit sampler view (texture resource) state */
828 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
829
830 /* Emit images state */
831 r600_emit_atom(rctx, &rctx->compute_images.atom);
832
833 /* Emit buffers state */
834 r600_emit_atom(rctx, &rctx->compute_buffers.atom);
835
836 /* Emit shader state */
837 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
838
839 /* Emit dispatch state and dispatch packet */
840 evergreen_emit_dispatch(rctx, info, indirect_grid);
841
842 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
843 */
844 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
845 R600_CONTEXT_INV_VERTEX_CACHE |
846 R600_CONTEXT_INV_TEX_CACHE;
847 r600_flush_emit(rctx);
848 rctx->b.flags = 0;
849
850 if (rctx->b.chip_class >= CAYMAN) {
851 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
852 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
853 /* DEALLOC_STATE prevents the GPU from hanging when a
854 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
855 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
856 */
857 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
858 radeon_emit(cs, 0);
859 }
860 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
861 evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
862
863 #if 0
864 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
865 for (i = 0; i < cs->cdw; i++) {
866 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
867 }
868 #endif
869
870 }
871
872
873 /**
874 * Emit function for r600_cs_shader_state atom
875 */
876 void evergreen_emit_cs_shader(struct r600_context *rctx,
877 struct r600_atom *atom)
878 {
879 struct r600_cs_shader_state *state =
880 (struct r600_cs_shader_state*)atom;
881 struct r600_pipe_compute *shader = state->shader;
882 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
883 uint64_t va;
884 struct r600_resource *code_bo;
885 unsigned ngpr, nstack;
886
887 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
888 code_bo = shader->sel->current->bo;
889 va = shader->sel->current->bo->gpu_address;
890 ngpr = shader->sel->current->shader.bc.ngpr;
891 nstack = shader->sel->current->shader.bc.nstack;
892 } else {
893 code_bo = shader->code_bo;
894 va = shader->code_bo->gpu_address + state->pc;
895 ngpr = shader->bc.ngpr;
896 nstack = shader->bc.nstack;
897 }
898
899 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
900 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
901 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
902 S_0288D4_NUM_GPRS(ngpr) |
903 S_0288D4_DX10_CLAMP(1) |
904 S_0288D4_STACK_SIZE(nstack));
905 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
906
907 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
908 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
909 code_bo, RADEON_USAGE_READ,
910 RADEON_PRIO_SHADER_BINARY));
911 }
912
913 static void evergreen_launch_grid(struct pipe_context *ctx,
914 const struct pipe_grid_info *info)
915 {
916 struct r600_context *rctx = (struct r600_context *)ctx;
917 #ifdef HAVE_OPENCL
918 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
919 boolean use_kill;
920
921 if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
922 rctx->cs_shader_state.pc = info->pc;
923 /* Get the config information for this kernel. */
924 r600_shader_binary_read_config(&shader->binary, &shader->bc,
925 info->pc, &use_kill);
926 } else {
927 use_kill = false;
928 rctx->cs_shader_state.pc = 0;
929 }
930 #endif
931
932 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
933
934
935 evergreen_compute_upload_input(ctx, info);
936 compute_emit_cs(rctx, info);
937 }
938
939 static void evergreen_set_compute_resources(struct pipe_context *ctx,
940 unsigned start, unsigned count,
941 struct pipe_surface **surfaces)
942 {
943 struct r600_context *rctx = (struct r600_context *)ctx;
944 struct r600_surface **resources = (struct r600_surface **)surfaces;
945
946 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
947 start, count);
948
949 for (unsigned i = 0; i < count; i++) {
950 /* The first four vertex buffers are reserved for parameters and
951 * global buffers. */
952 unsigned vtx_id = 4 + i;
953 if (resources[i]) {
954 struct r600_resource_global *buffer =
955 (struct r600_resource_global*)
956 resources[i]->base.texture;
957 if (resources[i]->base.writable) {
958 assert(i+1 < 12);
959
960 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
961 (struct r600_resource *)resources[i]->base.texture,
962 buffer->chunk->start_in_dw*4,
963 resources[i]->base.texture->width0);
964 }
965
966 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
967 buffer->chunk->start_in_dw * 4,
968 resources[i]->base.texture);
969 }
970 }
971 }
972
973 static void evergreen_set_global_binding(struct pipe_context *ctx,
974 unsigned first, unsigned n,
975 struct pipe_resource **resources,
976 uint32_t **handles)
977 {
978 struct r600_context *rctx = (struct r600_context *)ctx;
979 struct compute_memory_pool *pool = rctx->screen->global_pool;
980 struct r600_resource_global **buffers =
981 (struct r600_resource_global **)resources;
982 unsigned i;
983
984 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
985 first, n);
986
987 if (!resources) {
988 /* XXX: Unset */
989 return;
990 }
991
992 /* We mark these items for promotion to the pool if they
993 * aren't already there */
994 for (i = first; i < first + n; i++) {
995 struct compute_memory_item *item = buffers[i]->chunk;
996
997 if (!is_item_in_pool(item))
998 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
999 }
1000
1001 if (compute_memory_finalize_pending(pool, ctx) == -1) {
1002 /* XXX: Unset */
1003 return;
1004 }
1005
1006 for (i = first; i < first + n; i++)
1007 {
1008 uint32_t buffer_offset;
1009 uint32_t handle;
1010 assert(resources[i]->target == PIPE_BUFFER);
1011 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
1012
1013 buffer_offset = util_le32_to_cpu(*(handles[i]));
1014 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
1015
1016 *(handles[i]) = util_cpu_to_le32(handle);
1017 }
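/* For example (illustrative values): a buffer whose chunk starts at
 * start_in_dw = 0x100 and whose incoming handle holds a byte offset of 8
 * gets the patched handle 8 + 0x100 * 4 = 0x408. */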
1018
1019 /* globals for writing */
1020 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1021 /* globals for reading */
1022 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1023 (struct pipe_resource*)pool->bo);
1024
1025 /* constants for reading, LLVM puts them in text segment */
1026 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1027 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1028 }
1029
1030 /**
1031 * This function initializes all the compute specific registers that need to
1032 * be initialized for each compute command stream. Registers that are common
1033 * to both compute and 3D will be initialized at the beginning of each compute
1034 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
1035 * packet requires that the shader type bit be set, we must initialize all
1036 * context registers needed for compute in this function. The registers
1037 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1038 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1039 * on the GPU family.
1040 */
1041 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1042 {
1043 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1044 int num_threads;
1045 int num_stack_entries;
1046
1047 /* since all required registers are initialized in the
1048 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1049 */
1050 r600_init_command_buffer(cb, 256);
1051 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1052
1053 /* We're setting config registers here. */
1054 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1055 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1056
1057 switch (rctx->b.family) {
1058 case CHIP_CEDAR:
1059 default:
1060 num_threads = 128;
1061 num_stack_entries = 256;
1062 break;
1063 case CHIP_REDWOOD:
1064 num_threads = 128;
1065 num_stack_entries = 256;
1066 break;
1067 case CHIP_JUNIPER:
1068 num_threads = 128;
1069 num_stack_entries = 512;
1070 break;
1071 case CHIP_CYPRESS:
1072 case CHIP_HEMLOCK:
1073 num_threads = 128;
1074 num_stack_entries = 512;
1075 break;
1076 case CHIP_PALM:
1077 num_threads = 128;
1078 num_stack_entries = 256;
1079 break;
1080 case CHIP_SUMO:
1081 num_threads = 128;
1082 num_stack_entries = 256;
1083 break;
1084 case CHIP_SUMO2:
1085 num_threads = 128;
1086 num_stack_entries = 512;
1087 break;
1088 case CHIP_BARTS:
1089 num_threads = 128;
1090 num_stack_entries = 512;
1091 break;
1092 case CHIP_TURKS:
1093 num_threads = 128;
1094 num_stack_entries = 256;
1095 break;
1096 case CHIP_CAICOS:
1097 num_threads = 128;
1098 num_stack_entries = 256;
1099 break;
1100 }
1101
1102 /* The primitive type always needs to be POINTLIST for compute. */
1103 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1104 V_008958_DI_PT_POINTLIST);
1105
1106 if (rctx->b.chip_class < CAYMAN) {
1107
1108 /* These registers control which simds can be used by each stage.
1109 * The default for these registers is 0xffffffff, which means
1110 * all simds are available for each stage. It's possible we may
1111 * want to play around with these in the future, but for now
1112 * the default value is fine.
1113 *
1114 * R_008E20_SQ_STATIC_THREAD_MGMT1
1115 * R_008E24_SQ_STATIC_THREAD_MGMT2
1116 * R_008E28_SQ_STATIC_THREAD_MGMT3
1117 */
1118
1119 /* XXX: We may need to adjust the thread and stack resource
1120 * values for 3D/compute interop */
1121
1122 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1123
1124 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1125 * Set the number of threads used by the PS/VS/GS/ES stage to
1126 * 0.
1127 */
1128 r600_store_value(cb, 0);
1129
1130 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1131 * Set the number of threads used by the CS (aka LS) stage to
1132 * the maximum number of threads and set the number of threads
1133 * for the HS stage to 0. */
1134 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1135
1136 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1137 * Set the Control Flow stack entries to 0 for PS/VS stages */
1138 r600_store_value(cb, 0);
1139
1140 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1141 * Set the Control Flow stack entries to 0 for GS/ES stages */
1142 r600_store_value(cb, 0);
1143
1144 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1145 * Set the Control Flow stack entries to 0 for the HS stage, and
1146 * set it to the maximum value for the CS (aka LS) stage. */
1147 r600_store_value(cb,
1148 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1149 }
1150 /* Give the compute shader all the available LDS space.
1151 * NOTE: This only sets the maximum number of dwords that a compute
1152 * shader can allocate. When a shader is executed, we still need to
1153 * allocate the appropriate amount of LDS dwords using the
1154 * CM_R_0288E8_SQ_LDS_ALLOC register.
1155 */
1156 if (rctx->b.chip_class < CAYMAN) {
1157 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1158 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1159 } else {
1160 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1161 S_0286FC_NUM_PS_LDS(0) |
1162 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1163 }
1164
1165 /* Context Registers */
1166
1167 if (rctx->b.chip_class < CAYMAN) {
1168 /* workaround for hw issues with dyn gpr - must set all limits
1169 * to 240 instead of 0, 0x1e == 240 / 8
1170 */
1171 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1172 S_028838_PS_GPRS(0x1e) |
1173 S_028838_VS_GPRS(0x1e) |
1174 S_028838_GS_GPRS(0x1e) |
1175 S_028838_ES_GPRS(0x1e) |
1176 S_028838_HS_GPRS(0x1e) |
1177 S_028838_LS_GPRS(0x1e));
1178 }
1179
1180 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1181 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1182 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1183
1184 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1185
1186 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1187 S_0286E8_TID_IN_GROUP_ENA(1) |
1188 S_0286E8_TGID_ENA(1) |
1189 S_0286E8_DISABLE_INDEX_PACK(1));
1190
1191 /* The LOOP_CONST registers are an optimization for loops that allows
1192 * you to store the initial counter, increment value, and maximum
1193 * counter value in a register so that the hardware can calculate the
1194 * correct number of iterations for the loop, so that you don't need
1195 * to have the loop counter in your shader code. We don't currently use
1196 * this optimization, so we must keep track of the counter in the
1197 * shader and use a break instruction to exit loops. However, the
1198 * hardware will still use this register to determine when to exit a
1199 * loop, so we need to initialize the counter to 0, set the increment
1200 * value to 1 and the maximum counter value to 4095 (0xfff), which
1201 * is the maximum value allowed. This gives us a maximum of 4096
1202 * iterations for our loops, but hopefully our break instruction will
1203 * execute some time before the 4096th iteration.
1204 */
1205 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
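/* A sketch of how the 0x1000FFF value above decodes, assuming the usual
 * SQ_LOOP_CONST field layout (iteration count in the low 12 bits, initial
 * value in the middle 12 bits, increment in the top 8 bits):
 * count = 0xFFF (4095), init = 0, increment = 1. */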
1206 }
1207
1208 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1209 {
1210 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1211 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1212 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1213 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1214 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1215 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1216 rctx->b.b.launch_grid = evergreen_launch_grid;
1217
1218 }
1219
1220 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1221 struct pipe_resource *resource,
1222 unsigned level,
1223 unsigned usage,
1224 const struct pipe_box *box,
1225 struct pipe_transfer **ptransfer)
1226 {
1227 struct r600_context *rctx = (struct r600_context*)ctx;
1228 struct compute_memory_pool *pool = rctx->screen->global_pool;
1229 struct r600_resource_global* buffer =
1230 (struct r600_resource_global*)resource;
1231
1232 struct compute_memory_item *item = buffer->chunk;
1233 struct pipe_resource *dst = NULL;
1234 unsigned offset = box->x;
1235
1236 if (is_item_in_pool(item)) {
1237 compute_memory_demote_item(pool, item, ctx);
1238 }
1239 else {
1240 if (item->real_buffer == NULL) {
1241 item->real_buffer =
1242 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1243 }
1244 }
1245
1246 dst = (struct pipe_resource*)item->real_buffer;
1247
1248 if (usage & PIPE_TRANSFER_READ)
1249 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1250
1251 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1252 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1253 "width = %u, height = %u, depth = %u)\n", level, usage,
1254 box->x, box->y, box->z, box->width, box->height,
1255 box->depth);
1256 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1257 "%u (box.x)\n", item->id, box->x);
1258
1259
1260 assert(resource->target == PIPE_BUFFER);
1261 assert(resource->bind & PIPE_BIND_GLOBAL);
1262 assert(box->x >= 0);
1263 assert(box->y == 0);
1264 assert(box->z == 0);
1265
1266 ///TODO: do it better, mapping is not possible if the pool is too big
1267 return pipe_buffer_map_range(ctx, dst,
1268 offset, box->width, usage, ptransfer);
1269 }
1270
1271 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1272 struct pipe_transfer *transfer)
1273 {
1274 /* struct r600_resource_global are not real resources, they just map
1275 * to an offset within the compute memory pool. The function
1276 * r600_compute_global_transfer_map() maps the memory pool
1277 * resource rather than the struct r600_resource_global passed to
1278 * it as an argument and then initializes ptransfer->resource with
1279 * the memory pool resource (via pipe_buffer_map_range).
1280 * When transfer_unmap is called it uses the memory pool's
1281 * vtable which calls r600_buffer_transfer_unmap() rather than
1282 * this function.
1283 */
1284 assert (!"This function should not be called");
1285 }
1286
1287 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1288 struct pipe_transfer *transfer,
1289 const struct pipe_box *box)
1290 {
1291 assert(0 && "TODO");
1292 }
1293
1294 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1295 struct pipe_resource *res)
1296 {
1297 struct r600_resource_global* buffer = NULL;
1298 struct r600_screen* rscreen = NULL;
1299
1300 assert(res->target == PIPE_BUFFER);
1301 assert(res->bind & PIPE_BIND_GLOBAL);
1302
1303 buffer = (struct r600_resource_global*)res;
1304 rscreen = (struct r600_screen*)screen;
1305
1306 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1307
1308 buffer->chunk = NULL;
1309 free(res);
1310 }
1311
1312 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1313 {
1314 u_default_resource_get_handle, /* get_handle */
1315 r600_compute_global_buffer_destroy, /* resource_destroy */
1316 r600_compute_global_transfer_map, /* transfer_map */
1317 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1318 r600_compute_global_transfer_unmap, /* transfer_unmap */
1319 };
1320
1321 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1322 const struct pipe_resource *templ)
1323 {
1324 struct r600_resource_global* result = NULL;
1325 struct r600_screen* rscreen = NULL;
1326 int size_in_dw = 0;
1327
1328 assert(templ->target == PIPE_BUFFER);
1329 assert(templ->bind & PIPE_BIND_GLOBAL);
1330 assert(templ->array_size == 1 || templ->array_size == 0);
1331 assert(templ->depth0 == 1 || templ->depth0 == 0);
1332 assert(templ->height0 == 1 || templ->height0 == 0);
1333
1334 result = (struct r600_resource_global*)
1335 CALLOC(sizeof(struct r600_resource_global), 1);
1336 rscreen = (struct r600_screen*)screen;
1337
1338 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1339 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1340 templ->array_size);
1341
1342 result->base.b.vtbl = &r600_global_buffer_vtbl;
1343 result->base.b.b = *templ;
1344 result->base.b.b.screen = screen;
1345 pipe_reference_init(&result->base.b.b.reference, 1);
1346
1347 size_in_dw = (templ->width0+3) / 4;
1348
1349 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1350
1351 if (result->chunk == NULL)
1352 {
1353 free(result);
1354 return NULL;
1355 }
1356
1357 return &result->base.b.b;
1358 }