gallium: add blob field to pipe_llvm_program_header
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "pipebuffer/pb_buffer.h"
46 #include "evergreend.h"
47 #include "r600_shader.h"
48 #include "r600_pipe.h"
49 #include "r600_formats.h"
50 #include "evergreen_compute.h"
51 #include "evergreen_compute_internal.h"
52 #include "compute_memory_pool.h"
53 #include "sb/sb_public.h"
54 #include <inttypes.h>
55
56 /**
57 RAT0 is for global binding write
58 VTX1 is for global binding read
59
60 for writing images RAT1...
61 for reading images TEX2...
62 TEX2-RAT1 is paired
63
64 TEX2... consumes the same fetch resources that VTX2... would consume
65
66 CONST0 and VTX0 are for parameters
67 CONST0 binds the smaller input parameter buffer and is used for constant indexing;
68 it is also constant cached
69 VTX0 is used for indirect/non-constant indexing, or if the input is bigger than
70 the constant cache can handle
71
72 RATs are limited to 12, so we can bind at most 11 textures for writing,
73 because we reserve RAT0 for global bindings. With byte addressing enabled,
74 we should reserve another one too => 10 image bindings for writing max.
75
76 from Nvidia OpenCL:
77 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
78 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
79
80 so 10 for writing is enough. 176 is the max for reading according to the docs
81
82 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
83 writable images will consume TEX slots, and VTX slots too because of linear indexing
84
85 */
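/* A compact, purely illustrative summary of the fixed slot assignments
 * described above, as they are used further down in this file. The enum
 * and its names are hypothetical and not referenced by the driver code.
 */
enum r600_compute_example_slots {
	EXAMPLE_GLOBAL_WRITE_RAT = 0, /* RAT0: global buffer writes (see evergreen_set_global_binding) */
	EXAMPLE_GLOBAL_READ_VTX  = 1, /* VTX1: global buffer reads                                      */
	EXAMPLE_CODE_VTX         = 2, /* VTX2: constants that LLVM places in the text segment           */
	EXAMPLE_KERNEL_PARAM_VTX = 3, /* VTX3 (plus CONST0): kernel parameters                          */
	EXAMPLE_FIRST_USER_VTX   = 4, /* compute resource i binds to VTX(4 + i)                         */
};
/* Writable images additionally bind to RAT(i + 1), since RAT0 is reserved. */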
86
87 #ifdef HAVE_OPENCL
88 static void radeon_shader_binary_init(struct r600_shader_binary *b)
89 {
90 memset(b, 0, sizeof(*b));
91 }
92
93 static void radeon_shader_binary_clean(struct r600_shader_binary *b)
94 {
95 if (!b)
96 return;
97 FREE(b->code);
98 FREE(b->config);
99 FREE(b->rodata);
100 FREE(b->global_symbol_offsets);
101 FREE(b->relocs);
102 FREE(b->disasm_string);
103 }
104 #endif
105
106 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
107 unsigned size)
108 {
109 struct pipe_resource *buffer = NULL;
110 assert(size);
111
112 buffer = pipe_buffer_create((struct pipe_screen*) screen,
113 0, PIPE_USAGE_IMMUTABLE, size);
114
115 return (struct r600_resource *)buffer;
116 }
117
118
119 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
120 unsigned id,
121 struct r600_resource *bo,
122 int start,
123 int size)
124 {
125 struct pipe_surface rat_templ;
126 struct r600_surface *surf = NULL;
127 struct r600_context *rctx = NULL;
128
129 assert(id < 12);
130 assert((size & 3) == 0);
131 assert((start & 0xFF) == 0);
132
133 rctx = pipe->ctx;
134
135 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
136
137 /* Create the RAT surface */
138 memset(&rat_templ, 0, sizeof(rat_templ));
139 rat_templ.format = PIPE_FORMAT_R32_UINT;
140 rat_templ.u.tex.level = 0;
141 rat_templ.u.tex.first_layer = 0;
142 rat_templ.u.tex.last_layer = 0;
143
144 /* Add the RAT to the list of color buffers. Drop the old buffer first. */
145 pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
146 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
147 (struct pipe_context *)pipe->ctx,
148 (struct pipe_resource *)bo, &rat_templ);
149
150 /* Update the number of color buffers */
151 pipe->ctx->framebuffer.state.nr_cbufs =
152 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
153
154 /* Update the cb_target_mask
155 * XXX: I think this is a potential spot for bugs once we start doing
156 * GL interop. cb_target_mask may be modified in the 3D sections
157 * of this driver. */
158 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
159
160 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
161 evergreen_init_color_surface_rat(rctx, surf);
162 }
163
164 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
165 unsigned vb_index,
166 unsigned offset,
167 struct pipe_resource *buffer)
168 {
169 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
170 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
171 vb->stride = 1;
172 vb->buffer_offset = offset;
173 vb->buffer.resource = buffer;
174 vb->is_user_buffer = false;
175
176 /* The vertex instructions in the compute shaders use the texture cache,
177 * so we need to invalidate it. */
178 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
179 state->enabled_mask |= 1 << vb_index;
180 state->dirty_mask |= 1 << vb_index;
181 r600_mark_atom_dirty(rctx, &state->atom);
182 }
183
184 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
185 unsigned cb_index,
186 unsigned offset,
187 unsigned size,
188 struct pipe_resource *buffer)
189 {
190 struct pipe_constant_buffer cb;
191 cb.buffer_size = size;
192 cb.buffer_offset = offset;
193 cb.buffer = buffer;
194 cb.user_buffer = NULL;
195
196 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
197 }
198
199 /* We need to define these R600 registers here, because we can't include
200 * both evergreend.h and r600d.h at the same time.
201 */
202 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
203 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
204
205 #ifdef HAVE_OPENCL
206 static void parse_symbol_table(Elf_Data *symbol_table_data,
207 const GElf_Shdr *symbol_table_header,
208 struct r600_shader_binary *binary)
209 {
210 GElf_Sym symbol;
211 unsigned i = 0;
212 unsigned symbol_count =
213 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
214
215 /* We are over-allocating this list, because symbol_count gives the
216 * total number of symbols, and we will only be filling the list
217 * with offsets of global symbols. The memory savings from
218 * allocating the correct size of this list will be small, and
219 * I don't think it is worth the cost of pre-computing the number
220 * of global symbols.
221 */
222 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
223
224 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
225 unsigned i;
226 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
227 symbol.st_shndx == 0 /* Undefined symbol */) {
228 continue;
229 }
230
231 binary->global_symbol_offsets[binary->global_symbol_count] =
232 symbol.st_value;
233
234 /* Sort the list using bubble sort. This list will usually
235 * be small. */
236 for (i = binary->global_symbol_count; i > 0; --i) {
237 uint64_t lhs = binary->global_symbol_offsets[i - 1];
238 uint64_t rhs = binary->global_symbol_offsets[i];
239 if (lhs < rhs) {
240 break;
241 }
242 binary->global_symbol_offsets[i] = lhs;
243 binary->global_symbol_offsets[i - 1] = rhs;
244 }
245 ++binary->global_symbol_count;
246 }
247 }
248
249
250 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
251 unsigned symbol_sh_link,
252 struct r600_shader_binary *binary)
253 {
254 unsigned i;
255
256 if (!relocs || !symbols || !binary->reloc_count) {
257 return;
258 }
259 binary->relocs = CALLOC(binary->reloc_count,
260 sizeof(struct r600_shader_reloc));
261 for (i = 0; i < binary->reloc_count; i++) {
262 GElf_Sym symbol;
263 GElf_Rel rel;
264 char *symbol_name;
265 struct r600_shader_reloc *reloc = &binary->relocs[i];
266
267 gelf_getrel(relocs, i, &rel);
268 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
269 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
270
271 reloc->offset = rel.r_offset;
272 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
273 reloc->name[sizeof(reloc->name)-1] = 0;
274 }
275 }
276
277 static void r600_elf_read(const char *elf_data, unsigned elf_size,
278 struct r600_shader_binary *binary)
279 {
280 char *elf_buffer;
281 Elf *elf;
282 Elf_Scn *section = NULL;
283 Elf_Data *symbols = NULL, *relocs = NULL;
284 size_t section_str_index;
285 unsigned symbol_sh_link = 0;
286
287 /* One of the libelf implementations
288 * (http://www.mr511.de/software/english.htm) requires calling
289 * elf_version() before elf_memory().
290 */
291 elf_version(EV_CURRENT);
292 elf_buffer = MALLOC(elf_size);
293 memcpy(elf_buffer, elf_data, elf_size);
294
295 elf = elf_memory(elf_buffer, elf_size);
296
297 elf_getshdrstrndx(elf, &section_str_index);
298
299 while ((section = elf_nextscn(elf, section))) {
300 const char *name;
301 Elf_Data *section_data = NULL;
302 GElf_Shdr section_header;
303 if (gelf_getshdr(section, &section_header) != &section_header) {
304 fprintf(stderr, "Failed to read ELF section header\n");
305 return;
306 }
307 name = elf_strptr(elf, section_str_index, section_header.sh_name);
308 if (!strcmp(name, ".text")) {
309 section_data = elf_getdata(section, section_data);
310 binary->code_size = section_data->d_size;
311 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
312 memcpy(binary->code, section_data->d_buf, binary->code_size);
313 } else if (!strcmp(name, ".AMDGPU.config")) {
314 section_data = elf_getdata(section, section_data);
315 binary->config_size = section_data->d_size;
316 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
317 memcpy(binary->config, section_data->d_buf, binary->config_size);
318 } else if (!strcmp(name, ".AMDGPU.disasm")) {
319 /* Always read disassembly if it's available. */
320 section_data = elf_getdata(section, section_data);
321 binary->disasm_string = strndup(section_data->d_buf,
322 section_data->d_size);
323 } else if (!strncmp(name, ".rodata", 7)) {
324 section_data = elf_getdata(section, section_data);
325 binary->rodata_size = section_data->d_size;
326 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
327 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
328 } else if (!strncmp(name, ".symtab", 7)) {
329 symbols = elf_getdata(section, section_data);
330 symbol_sh_link = section_header.sh_link;
331 parse_symbol_table(symbols, &section_header, binary);
332 } else if (!strcmp(name, ".rel.text")) {
333 relocs = elf_getdata(section, section_data);
334 binary->reloc_count = section_header.sh_size /
335 section_header.sh_entsize;
336 }
337 }
338
339 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
340
341 if (elf){
342 elf_end(elf);
343 }
344 FREE(elf_buffer);
345
346 /* Cache the config size per symbol */
347 if (binary->global_symbol_count) {
348 binary->config_size_per_symbol =
349 binary->config_size / binary->global_symbol_count;
350 } else {
351 binary->global_symbol_count = 1;
352 binary->config_size_per_symbol = binary->config_size;
353 }
354 }
355
356 static const unsigned char *r600_shader_binary_config_start(
357 const struct r600_shader_binary *binary,
358 uint64_t symbol_offset)
359 {
360 unsigned i;
361 for (i = 0; i < binary->global_symbol_count; ++i) {
362 if (binary->global_symbol_offsets[i] == symbol_offset) {
363 unsigned offset = i * binary->config_size_per_symbol;
364 return binary->config + offset;
365 }
366 }
367 return binary->config;
368 }
369
370 static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
371 struct r600_bytecode *bc,
372 uint64_t symbol_offset,
373 boolean *use_kill)
374 {
375 unsigned i;
376 const unsigned char *config =
377 r600_shader_binary_config_start(binary, symbol_offset);
378
379 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
380 unsigned reg =
381 util_le32_to_cpu(*(uint32_t*)(config + i));
382 unsigned value =
383 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
384 switch (reg) {
385 /* R600 / R700 */
386 case R_028850_SQ_PGM_RESOURCES_PS:
387 case R_028868_SQ_PGM_RESOURCES_VS:
388 /* Evergreen / Northern Islands */
389 case R_028844_SQ_PGM_RESOURCES_PS:
390 case R_028860_SQ_PGM_RESOURCES_VS:
391 case R_0288D4_SQ_PGM_RESOURCES_LS:
392 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
393 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
394 break;
395 case R_02880C_DB_SHADER_CONTROL:
396 *use_kill = G_02880C_KILL_ENABLE(value);
397 break;
398 case R_0288E8_SQ_LDS_ALLOC:
399 bc->nlds_dw = value;
400 break;
401 }
402 }
403 }
404
405 static unsigned r600_create_shader(struct r600_bytecode *bc,
406 const struct r600_shader_binary *binary,
407 boolean *use_kill)
408
409 {
410 assert(binary->code_size % 4 == 0);
411 bc->bytecode = CALLOC(1, binary->code_size);
412 memcpy(bc->bytecode, binary->code, binary->code_size);
413 bc->ndw = binary->code_size / 4;
414
415 r600_shader_binary_read_config(binary, bc, 0, use_kill);
416 return 0;
417 }
418
419 #endif
420
421 static void r600_destroy_shader(struct r600_bytecode *bc)
422 {
423 FREE(bc->bytecode);
424 }
425
426 static void *evergreen_create_compute_state(struct pipe_context *ctx,
427 const struct pipe_compute_state *cso)
428 {
429 struct r600_context *rctx = (struct r600_context *)ctx;
430 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
431 #ifdef HAVE_OPENCL
432 const struct pipe_llvm_program_header *header;
433 void *p;
434 boolean use_kill;
435 #endif
436
437 shader->ctx = rctx;
438 shader->local_size = cso->req_local_mem;
439 shader->private_size = cso->req_private_mem;
440 shader->input_size = cso->req_input_mem;
441
442 shader->ir_type = cso->ir_type;
443
444 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
445 shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
446 return shader;
447 }
448 #ifdef HAVE_OPENCL
449 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
450 header = cso->prog;
451 radeon_shader_binary_init(&shader->binary);
452 r600_elf_read(header->blob, header->num_bytes, &shader->binary);
453 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
454
455 /* Upload code + ROdata */
456 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
457 shader->bc.ndw * 4);
458 p = r600_buffer_map_sync_with_rings(
459 &rctx->b, shader->code_bo,
460 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
461 //TODO: use util_memcpy_cpu_to_le32 ?
462 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
463 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
464 #endif
465
466 return shader;
467 }
468
469 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
470 {
471 struct r600_context *rctx = (struct r600_context *)ctx;
472 struct r600_pipe_compute *shader = state;
473
474 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
475
476 if (!shader)
477 return;
478
479 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
480 r600_delete_shader_selector(ctx, shader->sel);
481 } else {
482 #ifdef HAVE_OPENCL
483 radeon_shader_binary_clean(&shader->binary);
484 pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
485 pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
486 #endif
487 r600_destroy_shader(&shader->bc);
488 }
489 FREE(shader);
490 }
491
492 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
493 {
494 struct r600_context *rctx = (struct r600_context *)ctx;
495 struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
496 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
497
498 if (!state) {
499 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
500 return;
501 }
502
503 if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
504 bool compute_dirty;
505
506 r600_shader_select(ctx, cstate->sel, &compute_dirty);
507 }
508
509 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
510 }
511
512 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
513 * kernel parameters there are implicit parameters that need to be stored
514 * in the vertex buffer as well. Here is how these parameters are organized in
515 * the buffer:
516 *
517 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
518 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
519 * DWORDS 6-8: Number of work items within each work group in each dimension
520 * (x,y,z)
521 * DWORDS 9+ : Kernel parameters
522 */
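/* A hypothetical struct view of the layout described above; it is only an
 * illustration, evergreen_compute_upload_input() below writes the dwords by
 * hand through a mapped pointer.
 */
struct example_compute_input_layout {
	uint32_t num_work_groups[3];  /* DWORDS 0-2 */
	uint32_t global_size[3];      /* DWORDS 3-5 */
	uint32_t local_size[3];       /* DWORDS 6-8 */
	uint32_t kernel_parameters[]; /* DWORDS 9+  */
};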
523 static void evergreen_compute_upload_input(struct pipe_context *ctx,
524 const struct pipe_grid_info *info)
525 {
526 struct r600_context *rctx = (struct r600_context *)ctx;
527 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
528 unsigned i;
529 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
530 * parameters.
531 */
532 unsigned input_size;
533 uint32_t *num_work_groups_start;
534 uint32_t *global_size_start;
535 uint32_t *local_size_start;
536 uint32_t *kernel_parameters_start;
537 struct pipe_box box;
538 struct pipe_transfer *transfer = NULL;
539
540 if (!shader)
541 return;
542 if (shader->input_size == 0) {
543 return;
544 }
545 input_size = shader->input_size + 36;
546 if (!shader->kernel_param) {
547 /* Add space for the grid dimensions */
548 shader->kernel_param = (struct r600_resource *)
549 pipe_buffer_create(ctx->screen, 0,
550 PIPE_USAGE_IMMUTABLE, input_size);
551 }
552
553 u_box_1d(0, input_size, &box);
554 num_work_groups_start = ctx->transfer_map(ctx,
555 (struct pipe_resource*)shader->kernel_param,
556 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
557 &box, &transfer);
558 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
559 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
560 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
561
562 /* Copy the work group size */
563 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
564
565 /* Copy the global size */
566 for (i = 0; i < 3; i++) {
567 global_size_start[i] = info->grid[i] * info->block[i];
568 }
569
570 /* Copy the local dimensions */
571 memcpy(local_size_start, info->block, 3 * sizeof(uint));
572
573 /* Copy the kernel inputs */
574 memcpy(kernel_parameters_start, info->input, shader->input_size);
575
576 for (i = 0; i < (input_size / 4); i++) {
577 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
578 ((unsigned*)num_work_groups_start)[i]);
579 }
580
581 ctx->transfer_unmap(ctx, transfer);
582
583 /* ID=0 and ID=3 are reserved for the parameters.
584 * LLVM will preferably use ID=0, but it does not work for dynamic
585 * indices. */
586 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
587 (struct pipe_resource*)shader->kernel_param);
588 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
589 (struct pipe_resource*)shader->kernel_param);
590 }
591
592 static void evergreen_emit_dispatch(struct r600_context *rctx,
593 const struct pipe_grid_info *info,
594 uint32_t indirect_grid[3])
595 {
596 int i;
597 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
598 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
599 bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
600 unsigned num_waves;
601 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
602 unsigned wave_divisor = (16 * num_pipes);
603 int group_size = 1;
604 int grid_size = 1;
605 unsigned lds_size = shader->local_size / 4;
606
607 if (shader->ir_type != PIPE_SHADER_IR_TGSI)
608 lds_size += shader->bc.nlds_dw;
609
610 /* Calculate group_size/grid_size */
611 for (i = 0; i < 3; i++) {
612 group_size *= info->block[i];
613 }
614
615 for (i = 0; i < 3; i++) {
616 grid_size *= info->grid[i];
617 }
618
619 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
620 num_waves = (info->block[0] * info->block[1] * info->block[2] +
621 wave_divisor - 1) / wave_divisor;
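/* Worked example with made-up numbers: an 8x8x4 block on a chip with
 * num_pipes = 8 gives wave_divisor = 128 and
 * num_waves = (8*8*4 + 127) / 128 = 2. */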
622
623 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
624 "%u wavefronts per thread block, "
625 "allocating %u dwords lds.\n",
626 num_pipes, num_waves, lds_size);
627
628 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
629
630 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
631 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
632 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
633 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
634
635 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
636 group_size);
637
638 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
639 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
640 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
641 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
642
643 if (rctx->b.chip_class < CAYMAN) {
644 assert(lds_size <= 8192);
645 } else {
646 /* Cayman appears to have a slightly smaller limit, see the
647 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
648 assert(lds_size <= 8160);
649 }
650
651 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
652 lds_size | (num_waves << 14));
653
654 if (info->indirect) {
655 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
656 radeon_emit(cs, indirect_grid[0]);
657 radeon_emit(cs, indirect_grid[1]);
658 radeon_emit(cs, indirect_grid[2]);
659 radeon_emit(cs, 1);
660 } else {
661 /* Dispatch packet */
662 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
663 radeon_emit(cs, info->grid[0]);
664 radeon_emit(cs, info->grid[1]);
665 radeon_emit(cs, info->grid[2]);
666 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
667 radeon_emit(cs, 1);
668 }
669
670 if (rctx->is_debug)
671 eg_trace_emit(rctx);
672 }
673
674 static void compute_setup_cbs(struct r600_context *rctx)
675 {
676 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
677 unsigned i;
678
679 /* Emit colorbuffers. */
680 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
681 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
682 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
683 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
684 (struct r600_resource*)cb->base.texture,
685 RADEON_USAGE_READWRITE,
686 RADEON_PRIO_SHADER_RW_BUFFER);
687
688 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
689 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
690 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
691 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
692 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
693 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
694 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
695 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
696
697 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
698 radeon_emit(cs, reloc);
699
700 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
701 radeon_emit(cs, reloc);
702 }
703 for (; i < 8 ; i++)
704 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
705 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
706 for (; i < 12; i++)
707 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
708 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
709
710 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
711 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
712 rctx->compute_cb_target_mask);
713 }
714
715 static void compute_emit_cs(struct r600_context *rctx,
716 const struct pipe_grid_info *info)
717 {
718 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
719 bool compute_dirty = false;
720 struct r600_pipe_shader *current;
721 struct r600_shader_atomic combined_atomics[8];
722 uint8_t atomic_used_mask;
723 uint32_t indirect_grid[3] = { 0, 0, 0 };
724
725 /* make sure that the gfx ring is the only one active */
726 if (radeon_emitted(rctx->b.dma.cs, 0)) {
727 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
728 }
729
730 r600_update_compressed_resource_state(rctx, true);
731
732 if (!rctx->cmd_buf_is_compute) {
733 rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
734 rctx->cmd_buf_is_compute = true;
735 }
736
737 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
738 r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
739 current = rctx->cs_shader_state.shader->sel->current;
740 if (compute_dirty) {
741 rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
742 r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
743 r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
744 }
745
746 bool need_buf_const = current->shader.uses_tex_buffers ||
747 current->shader.has_txq_cube_array_z_comp;
748
749 if (info->indirect) {
750 struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
751 unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
752 unsigned offset = info->indirect_offset / 4;
753 indirect_grid[0] = data[offset];
754 indirect_grid[1] = data[offset + 1];
755 indirect_grid[2] = data[offset + 2];
756 }
757 for (int i = 0; i < 3; i++) {
758 rctx->cs_block_grid_sizes[i] = info->block[i];
759 rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
760 }
761 rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
762 rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
763
764 evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
765 r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
766
767 if (need_buf_const) {
768 eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
769 }
770 r600_update_driver_const_buffers(rctx, true);
771
772 evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
773 if (atomic_used_mask) {
774 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
775 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
776 }
777 } else
778 r600_need_cs_space(rctx, 0, true, 0);
779
780 /* Initialize all the compute-related registers.
781 *
782 * See evergreen_init_atom_start_compute_cs() in this file for the list
783 * of registers initialized by the start_compute_cs_cmd atom.
784 */
785 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
786
787 /* emit config state */
788 if (rctx->b.chip_class == EVERGREEN) {
789 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
790 radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
791 radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
792 radeon_emit(cs, 0);
793 radeon_emit(cs, 0);
794 radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
795 } else
796 r600_emit_atom(rctx, &rctx->config_state.atom);
797 }
798
799 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
800 r600_flush_emit(rctx);
801
802 if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {
803
804 compute_setup_cbs(rctx);
805
806 /* Emit vertex buffer state */
807 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
808 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
809 } else {
810 uint32_t rat_mask;
811
812 rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
813 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
814 rat_mask);
815 }
816
817 r600_emit_atom(rctx, &rctx->b.render_cond_atom);
818
819 /* Emit constant buffer state */
820 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
821
822 /* Emit sampler state */
823 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
824
825 /* Emit sampler view (texture resource) state */
826 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
827
828 /* Emit images state */
829 r600_emit_atom(rctx, &rctx->compute_images.atom);
830
831 /* Emit buffers state */
832 r600_emit_atom(rctx, &rctx->compute_buffers.atom);
833
834 /* Emit shader state */
835 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
836
837 /* Emit dispatch state and dispatch packet */
838 evergreen_emit_dispatch(rctx, info, indirect_grid);
839
840 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
841 */
842 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
843 R600_CONTEXT_INV_VERTEX_CACHE |
844 R600_CONTEXT_INV_TEX_CACHE;
845 r600_flush_emit(rctx);
846 rctx->b.flags = 0;
847
848 if (rctx->b.chip_class >= CAYMAN) {
849 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
850 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
851 /* DEALLOC_STATE prevents the GPU from hanging when a
852 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
853 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
854 */
855 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
856 radeon_emit(cs, 0);
857 }
858 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
859 evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
860
861 #if 0
862 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
863 for (i = 0; i < cs->cdw; i++) {
864 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
865 }
866 #endif
867
868 }
869
870
871 /**
872 * Emit function for r600_cs_shader_state atom
873 */
874 void evergreen_emit_cs_shader(struct r600_context *rctx,
875 struct r600_atom *atom)
876 {
877 struct r600_cs_shader_state *state =
878 (struct r600_cs_shader_state*)atom;
879 struct r600_pipe_compute *shader = state->shader;
880 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
881 uint64_t va;
882 struct r600_resource *code_bo;
883 unsigned ngpr, nstack;
884
885 if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
886 code_bo = shader->sel->current->bo;
887 va = shader->sel->current->bo->gpu_address;
888 ngpr = shader->sel->current->shader.bc.ngpr;
889 nstack = shader->sel->current->shader.bc.nstack;
890 } else {
891 code_bo = shader->code_bo;
892 va = shader->code_bo->gpu_address + state->pc;
893 ngpr = shader->bc.ngpr;
894 nstack = shader->bc.nstack;
895 }
896
897 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
898 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
899 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
900 S_0288D4_NUM_GPRS(ngpr) |
901 S_0288D4_DX10_CLAMP(1) |
902 S_0288D4_STACK_SIZE(nstack));
903 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
904
905 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
906 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
907 code_bo, RADEON_USAGE_READ,
908 RADEON_PRIO_SHADER_BINARY));
909 }
910
911 static void evergreen_launch_grid(struct pipe_context *ctx,
912 const struct pipe_grid_info *info)
913 {
914 struct r600_context *rctx = (struct r600_context *)ctx;
915 #ifdef HAVE_OPENCL
916 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
917 boolean use_kill;
918
919 if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
920 rctx->cs_shader_state.pc = info->pc;
921 /* Get the config information for this kernel. */
922 r600_shader_binary_read_config(&shader->binary, &shader->bc,
923 info->pc, &use_kill);
924 } else {
925 use_kill = false;
926 rctx->cs_shader_state.pc = 0;
927 }
928 #endif
929
930 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
931
932
933 evergreen_compute_upload_input(ctx, info);
934 compute_emit_cs(rctx, info);
935 }
936
937 static void evergreen_set_compute_resources(struct pipe_context *ctx,
938 unsigned start, unsigned count,
939 struct pipe_surface **surfaces)
940 {
941 struct r600_context *rctx = (struct r600_context *)ctx;
942 struct r600_surface **resources = (struct r600_surface **)surfaces;
943
944 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
945 start, count);
946
947 for (unsigned i = 0; i < count; i++) {
948 /* The first four vertex buffers are reserved for parameters and
949 * global buffers. */
950 unsigned vtx_id = 4 + i;
951 if (resources[i]) {
952 struct r600_resource_global *buffer =
953 (struct r600_resource_global*)
954 resources[i]->base.texture;
955 if (resources[i]->base.writable) {
956 assert(i+1 < 12);
957
958 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
959 (struct r600_resource *)resources[i]->base.texture,
960 buffer->chunk->start_in_dw*4,
961 resources[i]->base.texture->width0);
962 }
963
964 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
965 buffer->chunk->start_in_dw * 4,
966 resources[i]->base.texture);
967 }
968 }
969 }
970
971 static void evergreen_set_global_binding(struct pipe_context *ctx,
972 unsigned first, unsigned n,
973 struct pipe_resource **resources,
974 uint32_t **handles)
975 {
976 struct r600_context *rctx = (struct r600_context *)ctx;
977 struct compute_memory_pool *pool = rctx->screen->global_pool;
978 struct r600_resource_global **buffers =
979 (struct r600_resource_global **)resources;
980 unsigned i;
981
982 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
983 first, n);
984
985 if (!resources) {
986 /* XXX: Unset */
987 return;
988 }
989
990 /* We mark these items for promotion to the pool if they
991 * aren't already there */
992 for (i = first; i < first + n; i++) {
993 struct compute_memory_item *item = buffers[i]->chunk;
994
995 if (!is_item_in_pool(item))
996 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
997 }
998
999 if (compute_memory_finalize_pending(pool, ctx) == -1) {
1000 /* XXX: Unset */
1001 return;
1002 }
1003
1004 for (i = first; i < first + n; i++)
1005 {
1006 uint32_t buffer_offset;
1007 uint32_t handle;
1008 assert(resources[i]->target == PIPE_BUFFER);
1009 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
1010
1011 buffer_offset = util_le32_to_cpu(*(handles[i]));
1012 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
1013
1014 *(handles[i]) = util_cpu_to_le32(handle);
1015 }
1016
1017 /* globals for writing */
1018 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1019 /* globals for reading */
1020 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1021 (struct pipe_resource*)pool->bo);
1022
1023 /* constants for reading, LLVM puts them in text segment */
1024 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1025 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1026 }
1027
1028 /**
1029 * This function initializes all the compute-specific registers that need to
1030 * be initialized for each compute command stream. Registers that are common
1031 * to both compute and 3D will be initialized at the beginning of each compute
1032 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
1033 * packet requires that the shader type bit be set, we must initialize all
1034 * context registers needed for compute in this function. The registers
1035 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1036 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1037 * on the GPU family.
1038 */
1039 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1040 {
1041 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1042 int num_threads;
1043 int num_stack_entries;
1044
1045 /* since all required registers are initialized in the
1046 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1047 */
1048 r600_init_command_buffer(cb, 256);
1049 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1050
1051 /* We're setting config registers here. */
1052 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1053 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1054
1055 switch (rctx->b.family) {
1056 case CHIP_CEDAR:
1057 default:
1058 num_threads = 128;
1059 num_stack_entries = 256;
1060 break;
1061 case CHIP_REDWOOD:
1062 num_threads = 128;
1063 num_stack_entries = 256;
1064 break;
1065 case CHIP_JUNIPER:
1066 num_threads = 128;
1067 num_stack_entries = 512;
1068 break;
1069 case CHIP_CYPRESS:
1070 case CHIP_HEMLOCK:
1071 num_threads = 128;
1072 num_stack_entries = 512;
1073 break;
1074 case CHIP_PALM:
1075 num_threads = 128;
1076 num_stack_entries = 256;
1077 break;
1078 case CHIP_SUMO:
1079 num_threads = 128;
1080 num_stack_entries = 256;
1081 break;
1082 case CHIP_SUMO2:
1083 num_threads = 128;
1084 num_stack_entries = 512;
1085 break;
1086 case CHIP_BARTS:
1087 num_threads = 128;
1088 num_stack_entries = 512;
1089 break;
1090 case CHIP_TURKS:
1091 num_threads = 128;
1092 num_stack_entries = 256;
1093 break;
1094 case CHIP_CAICOS:
1095 num_threads = 128;
1096 num_stack_entries = 256;
1097 break;
1098 }
1099
1100 /* The primitive type always needs to be POINTLIST for compute. */
1101 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1102 V_008958_DI_PT_POINTLIST);
1103
1104 if (rctx->b.chip_class < CAYMAN) {
1105
1106 /* These registers control which simds can be used by each stage.
1107 * The default for these registers is 0xffffffff, which means
1108 * all simds are available for each stage. It's possible we may
1109 * want to play around with these in the future, but for now
1110 * the default value is fine.
1111 *
1112 * R_008E20_SQ_STATIC_THREAD_MGMT1
1113 * R_008E24_SQ_STATIC_THREAD_MGMT2
1114 * R_008E28_SQ_STATIC_THREAD_MGMT3
1115 */
1116
1117 /* XXX: We may need to adjust the thread and stack resource
1118 * values for 3D/compute interop */
1119
1120 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1121
1122 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1123 * Set the number of threads used by the PS/VS/GS/ES stage to
1124 * 0.
1125 */
1126 r600_store_value(cb, 0);
1127
1128 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1129 * Set the number of threads used by the CS (aka LS) stage to
1130 * the maximum number of threads and set the number of threads
1131 * for the HS stage to 0. */
1132 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1133
1134 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1135 * Set the Control Flow stack entries to 0 for PS/VS stages */
1136 r600_store_value(cb, 0);
1137
1138 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1139 * Set the Control Flow stack entries to 0 for GS/ES stages */
1140 r600_store_value(cb, 0);
1141
1142 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1143 * Set the Control Flow stack entries to 0 for the HS stage, and
1144 * set it to the maximum value for the CS (aka LS) stage. */
1145 r600_store_value(cb,
1146 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1147 }
1148 /* Give the compute shader all the available LDS space.
1149 * NOTE: This only sets the maximum number of dwords that a compute
1150 * shader can allocate. When a shader is executed, we still need to
1151 * allocate the appropriate amount of LDS dwords using the
1152 * CM_R_0288E8_SQ_LDS_ALLOC register.
1153 */
1154 if (rctx->b.chip_class < CAYMAN) {
1155 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1156 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1157 } else {
1158 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1159 S_0286FC_NUM_PS_LDS(0) |
1160 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1161 }
1162
1163 /* Context Registers */
1164
1165 if (rctx->b.chip_class < CAYMAN) {
1166 /* workaround for hw issues with dyn gpr - must set all limits
1167 * to 240 instead of 0, 0x1e == 240 / 8
1168 */
1169 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1170 S_028838_PS_GPRS(0x1e) |
1171 S_028838_VS_GPRS(0x1e) |
1172 S_028838_GS_GPRS(0x1e) |
1173 S_028838_ES_GPRS(0x1e) |
1174 S_028838_HS_GPRS(0x1e) |
1175 S_028838_LS_GPRS(0x1e));
1176 }
1177
1178 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1179 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1180 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1181
1182 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1183
1184 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1185 S_0286E8_TID_IN_GROUP_ENA(1) |
1186 S_0286E8_TGID_ENA(1) |
1187 S_0286E8_DISABLE_INDEX_PACK(1));
1188
1189 /* The LOOP_CONST registers are an optimization for loops that allows
1190 * you to store the initial counter, increment value, and maximum
1191 * counter value in a register so that the hardware can calculate the
1192 * correct number of iterations for the loop, so that you don't need
1193 * to have the loop counter in your shader code. We don't currently use
1194 * this optimization, so we must keep track of the counter in the
1195 * shader and use a break instruction to exit loops. However, the
1196 * hardware will still use this register to determine when to exit a
1197 * loop, so we need to initialize the counter to 0, set the increment
1198 * value to 1, and the maximum counter value to 4095 (0xfff), which
1199 * is the maximum value allowed. This gives us a maximum of 4096
1200 * iterations for our loops, but hopefully our break instruction will
1201 * execute some time before the 4096th iteration.
1202 */
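/* Illustrative breakdown of the value stored below, assuming the usual
 * SQ_LOOP_CONST packing (increment in bits 31:24, initial value in
 * bits 23:12, trip count in bits 11:0):
 *   (1 << 24) | (0 << 12) | 0xFFF == 0x1000FFF
 */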
1203 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1204 }
1205
1206 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1207 {
1208 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1209 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1210 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1211 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1212 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1213 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1214 rctx->b.b.launch_grid = evergreen_launch_grid;
1215
1216 }
1217
1218 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1219 struct pipe_resource *resource,
1220 unsigned level,
1221 unsigned usage,
1222 const struct pipe_box *box,
1223 struct pipe_transfer **ptransfer)
1224 {
1225 struct r600_context *rctx = (struct r600_context*)ctx;
1226 struct compute_memory_pool *pool = rctx->screen->global_pool;
1227 struct r600_resource_global* buffer =
1228 (struct r600_resource_global*)resource;
1229
1230 struct compute_memory_item *item = buffer->chunk;
1231 struct pipe_resource *dst = NULL;
1232 unsigned offset = box->x;
1233
1234 if (is_item_in_pool(item)) {
1235 compute_memory_demote_item(pool, item, ctx);
1236 }
1237 else {
1238 if (item->real_buffer == NULL) {
1239 item->real_buffer =
1240 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1241 }
1242 }
1243
1244 dst = (struct pipe_resource*)item->real_buffer;
1245
1246 if (usage & PIPE_TRANSFER_READ)
1247 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1248
1249 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1250 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1251 "width = %u, height = %u, depth = %u)\n", level, usage,
1252 box->x, box->y, box->z, box->width, box->height,
1253 box->depth);
1254 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1255 "%u (box.x)\n", item->id, box->x);
1256
1257
1258 assert(resource->target == PIPE_BUFFER);
1259 assert(resource->bind & PIPE_BIND_GLOBAL);
1260 assert(box->x >= 0);
1261 assert(box->y == 0);
1262 assert(box->z == 0);
1263
1264 ///TODO: do it better, mapping is not possible if the pool is too big
1265 return pipe_buffer_map_range(ctx, dst,
1266 offset, box->width, usage, ptransfer);
1267 }
1268
1269 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1270 struct pipe_transfer *transfer)
1271 {
1272 /* struct r600_resource_global are not real resources, they just map
1273 * to an offset within the compute memory pool. The function
1274 * r600_compute_global_transfer_map() maps the memory pool
1275 * resource rather than the struct r600_resource_global passed to
1276 * it as an argument and then initializes ptransfer->resource with
1277 * the memory pool resource (via pipe_buffer_map_range).
1278 * When transfer_unmap is called it uses the memory pool's
1279 * vtable which calls r600_buffer_transfer_unmap() rather than
1280 * this function.
1281 */
1282 assert (!"This function should not be called");
1283 }
1284
1285 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1286 struct pipe_transfer *transfer,
1287 const struct pipe_box *box)
1288 {
1289 assert(0 && "TODO");
1290 }
1291
1292 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1293 struct pipe_resource *res)
1294 {
1295 struct r600_resource_global* buffer = NULL;
1296 struct r600_screen* rscreen = NULL;
1297
1298 assert(res->target == PIPE_BUFFER);
1299 assert(res->bind & PIPE_BIND_GLOBAL);
1300
1301 buffer = (struct r600_resource_global*)res;
1302 rscreen = (struct r600_screen*)screen;
1303
1304 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1305
1306 buffer->chunk = NULL;
1307 free(res);
1308 }
1309
1310 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1311 {
1312 u_default_resource_get_handle, /* get_handle */
1313 r600_compute_global_buffer_destroy, /* resource_destroy */
1314 r600_compute_global_transfer_map, /* transfer_map */
1315 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1316 r600_compute_global_transfer_unmap, /* transfer_unmap */
1317 };
1318
1319 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1320 const struct pipe_resource *templ)
1321 {
1322 struct r600_resource_global* result = NULL;
1323 struct r600_screen* rscreen = NULL;
1324 int size_in_dw = 0;
1325
1326 assert(templ->target == PIPE_BUFFER);
1327 assert(templ->bind & PIPE_BIND_GLOBAL);
1328 assert(templ->array_size == 1 || templ->array_size == 0);
1329 assert(templ->depth0 == 1 || templ->depth0 == 0);
1330 assert(templ->height0 == 1 || templ->height0 == 0);
1331
1332 result = (struct r600_resource_global*)
1333 CALLOC(sizeof(struct r600_resource_global), 1);
1334 rscreen = (struct r600_screen*)screen;
1335
1336 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1337 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1338 templ->array_size);
1339
1340 result->base.b.vtbl = &r600_global_buffer_vtbl;
1341 result->base.b.b = *templ;
1342 result->base.b.b.screen = screen;
1343 pipe_reference_init(&result->base.b.b.reference, 1);
1344
1345 size_in_dw = (templ->width0+3) / 4;
1346
1347 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1348
1349 if (result->chunk == NULL)
1350 {
1351 free(result);
1352 return NULL;
1353 }
1354
1355 return &result->base.b.b;
1356 }