mesa.git: src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "pipebuffer/pb_buffer.h"
46 #include "evergreend.h"
47 #include "r600_shader.h"
48 #include "r600_pipe.h"
49 #include "r600_formats.h"
50 #include "evergreen_compute.h"
51 #include "evergreen_compute_internal.h"
52 #include "compute_memory_pool.h"
53 #include "sb/sb_public.h"
54 #include <inttypes.h>
55
56 /**
57 RAT0 is for global binding writes
58 VTX1 is for global binding reads
59
60 for writing images RAT1...
61 for reading images TEX2...
62 TEX2 and RAT1 are paired
63
64 TEX2... consumes the same fetch resources that VTX2... would consume
65
66 CONST0 and VTX0 are for parameters
67 CONST0 binds the smaller input parameter buffer and is used for constant
68 indexing; it is also cached in the constant cache
69 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
70 the constant cache can handle
71
72 RATs are limited to 12, so we can bind at most 11 textures for writing
73 because we reserve RAT0 for global bindings. With byte addressing enabled
74 we should reserve another one too => at most 10 image bindings for writing.
75
76 from Nvidia OpenCL:
77 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
78 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
79
80 so 10 for writing is enough. 176 is the max for reading according to the docs
81
82 writable images should be listed first (ids < 10), so their id corresponds to RAT(id+1)
83 writable images will consume TEX slots, and VTX slots too because of linear indexing
84
85 */
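/* For orientation, this is how those IDs end up being wired further down in
 * this file (a summary of the code below, not new policy):
 *   RAT0                     - global memory pool for writes (evergreen_set_global_binding)
 *   RAT1..RAT11              - writable compute surfaces (evergreen_set_compute_resources)
 *   CONST0 (+ reserved VTX0) - kernel input parameters (evergreen_compute_upload_input)
 *   VTX1                     - global memory pool for reads (evergreen_set_global_binding)
 *   VTX2                     - constants kept in the shader's text segment (code_bo)
 *   VTX3                     - kernel parameters again, for dynamic indexing
 *   VTX4...                  - read bindings for the remaining compute resources
 */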
86
87 #ifdef HAVE_OPENCL
88 static void radeon_shader_binary_init(struct r600_shader_binary *b)
89 {
90 memset(b, 0, sizeof(*b));
91 }
92
93 static void radeon_shader_binary_clean(struct r600_shader_binary *b)
94 {
95 if (!b)
96 return;
97 FREE(b->code);
98 FREE(b->config);
99 FREE(b->rodata);
100 FREE(b->global_symbol_offsets);
101 FREE(b->relocs);
102 FREE(b->disasm_string);
103 }
104 #endif
105
106 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
107 unsigned size)
108 {
109 struct pipe_resource *buffer = NULL;
110 assert(size);
111
112 buffer = pipe_buffer_create((struct pipe_screen*) screen,
113 0, PIPE_USAGE_IMMUTABLE, size);
114
115 return (struct r600_resource *)buffer;
116 }
117
118
119 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
120 unsigned id,
121 struct r600_resource *bo,
122 int start,
123 int size)
124 {
125 struct pipe_surface rat_templ;
126 struct r600_surface *surf = NULL;
127 struct r600_context *rctx = NULL;
128
129 assert(id < 12);
130 assert((size & 3) == 0);
131 assert((start & 0xFF) == 0);
132
133 rctx = pipe->ctx;
134
135 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
136
137 /* Create the RAT surface */
138 memset(&rat_templ, 0, sizeof(rat_templ));
139 rat_templ.format = PIPE_FORMAT_R32_UINT;
140 rat_templ.u.tex.level = 0;
141 rat_templ.u.tex.first_layer = 0;
142 rat_templ.u.tex.last_layer = 0;
143
144 /* Add the RAT to the list of color buffers. Drop the old buffer first. */
145 pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
146 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
147 (struct pipe_context *)pipe->ctx,
148 (struct pipe_resource *)bo, &rat_templ);
149
150 /* Update the number of color buffers */
151 pipe->ctx->framebuffer.state.nr_cbufs =
152 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
153
154 /* Update the cb_target_mask
155 * XXX: I think this is a potential spot for bugs once we start doing
156 * GL interop. cb_target_mask may be modified in the 3D sections
157 * of this driver. */
158 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
159
160 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
161 evergreen_init_color_surface_rat(rctx, surf);
162 }
163
164 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
165 unsigned vb_index,
166 unsigned offset,
167 struct pipe_resource *buffer)
168 {
169 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
170 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
171 vb->stride = 1;
172 vb->buffer_offset = offset;
173 vb->buffer.resource = buffer;
174 vb->is_user_buffer = false;
175
176 /* The vertex instructions in the compute shaders use the texture cache,
177 * so we need to invalidate it. */
178 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
179 state->enabled_mask |= 1 << vb_index;
180 state->dirty_mask |= 1 << vb_index;
181 r600_mark_atom_dirty(rctx, &state->atom);
182 }
183
184 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
185 unsigned cb_index,
186 unsigned offset,
187 unsigned size,
188 struct pipe_resource *buffer)
189 {
190 struct pipe_constant_buffer cb;
191 cb.buffer_size = size;
192 cb.buffer_offset = offset;
193 cb.buffer = buffer;
194 cb.user_buffer = NULL;
195
196 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
197 }
198
199 /* We need to define these R600 registers here, because we can't include
200 * evergreend.h and r600d.h.
201 */
202 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
203 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
204
205 #ifdef HAVE_OPENCL
206 static void parse_symbol_table(Elf_Data *symbol_table_data,
207 const GElf_Shdr *symbol_table_header,
208 struct r600_shader_binary *binary)
209 {
210 GElf_Sym symbol;
211 unsigned i = 0;
212 unsigned symbol_count =
213 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
214
215 /* We are over allocating this list, because symbol_count gives the
216 * total number of symbols, and we will only be filling the list
217 * with offsets of global symbols. The memory savings from
218 * allocating the correct size of this list will be small, and
219 * I don't think it is worth the cost of pre-computing the number
220 * of global symbols.
221 */
222 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
223
224 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
225 unsigned i;
226 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
227 symbol.st_shndx == 0 /* Undefined symbol */) {
228 continue;
229 }
230
231 binary->global_symbol_offsets[binary->global_symbol_count] =
232 symbol.st_value;
233
234 /* Sort the list using bubble sort. This list will usually
235 * be small. */
236 for (i = binary->global_symbol_count; i > 0; --i) {
237 uint64_t lhs = binary->global_symbol_offsets[i - 1];
238 uint64_t rhs = binary->global_symbol_offsets[i];
239 if (lhs < rhs) {
240 break;
241 }
242 binary->global_symbol_offsets[i] = lhs;
243 binary->global_symbol_offsets[i - 1] = rhs;
244 }
245 ++binary->global_symbol_count;
246 }
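	/* Illustrative example (not from the original code): if the sorted list
	 * so far is {0x10, 0x80} and the next global symbol has offset 0x40, the
	 * insertion loop above shifts 0x80 up one slot and stores 0x40 before it,
	 * leaving {0x10, 0x40, 0x80}; the offsets therefore stay in ascending
	 * order. */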
247 }
248
249
250 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
251 unsigned symbol_sh_link,
252 struct r600_shader_binary *binary)
253 {
254 unsigned i;
255
256 if (!relocs || !symbols || !binary->reloc_count) {
257 return;
258 }
259 binary->relocs = CALLOC(binary->reloc_count,
260 sizeof(struct r600_shader_reloc));
261 for (i = 0; i < binary->reloc_count; i++) {
262 GElf_Sym symbol;
263 GElf_Rel rel;
264 char *symbol_name;
265 struct r600_shader_reloc *reloc = &binary->relocs[i];
266
267 gelf_getrel(relocs, i, &rel);
268 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
269 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
270
271 reloc->offset = rel.r_offset;
272 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
273 reloc->name[sizeof(reloc->name)-1] = 0;
274 }
275 }
276
277 static void r600_elf_read(const char *elf_data, unsigned elf_size,
278 struct r600_shader_binary *binary)
279 {
280 char *elf_buffer;
281 Elf *elf;
282 Elf_Scn *section = NULL;
283 Elf_Data *symbols = NULL, *relocs = NULL;
284 size_t section_str_index;
285 unsigned symbol_sh_link = 0;
286
287 /* One of the libelf implementations
288 * (http://www.mr511.de/software/english.htm) requires calling
289 * elf_version() before elf_memory().
290 */
291 elf_version(EV_CURRENT);
292 elf_buffer = MALLOC(elf_size);
293 memcpy(elf_buffer, elf_data, elf_size);
294
295 elf = elf_memory(elf_buffer, elf_size);
296
297 elf_getshdrstrndx(elf, &section_str_index);
298
299 while ((section = elf_nextscn(elf, section))) {
300 const char *name;
301 Elf_Data *section_data = NULL;
302 GElf_Shdr section_header;
303 if (gelf_getshdr(section, &section_header) != &section_header) {
304 fprintf(stderr, "Failed to read ELF section header\n");
305 return;
306 }
307 name = elf_strptr(elf, section_str_index, section_header.sh_name);
308 if (!strcmp(name, ".text")) {
309 section_data = elf_getdata(section, section_data);
310 binary->code_size = section_data->d_size;
311 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
312 memcpy(binary->code, section_data->d_buf, binary->code_size);
313 } else if (!strcmp(name, ".AMDGPU.config")) {
314 section_data = elf_getdata(section, section_data);
315 binary->config_size = section_data->d_size;
316 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
317 memcpy(binary->config, section_data->d_buf, binary->config_size);
318 } else if (!strcmp(name, ".AMDGPU.disasm")) {
319 /* Always read disassembly if it's available. */
320 section_data = elf_getdata(section, section_data);
321 binary->disasm_string = strndup(section_data->d_buf,
322 section_data->d_size);
323 } else if (!strncmp(name, ".rodata", 7)) {
324 section_data = elf_getdata(section, section_data);
325 binary->rodata_size = section_data->d_size;
326 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
327 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
328 } else if (!strncmp(name, ".symtab", 7)) {
329 symbols = elf_getdata(section, section_data);
330 symbol_sh_link = section_header.sh_link;
331 parse_symbol_table(symbols, &section_header, binary);
332 } else if (!strcmp(name, ".rel.text")) {
333 relocs = elf_getdata(section, section_data);
334 binary->reloc_count = section_header.sh_size /
335 section_header.sh_entsize;
336 }
337 }
338
339 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
340
341 if (elf){
342 elf_end(elf);
343 }
344 FREE(elf_buffer);
345
346 /* Cache the config size per symbol */
347 if (binary->global_symbol_count) {
348 binary->config_size_per_symbol =
349 binary->config_size / binary->global_symbol_count;
350 } else {
351 binary->global_symbol_count = 1;
352 binary->config_size_per_symbol = binary->config_size;
353 }
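	/* Worked example (illustrative only): an ELF containing three kernels
	 * whose .AMDGPU.config section is 72 bytes ends up with
	 * config_size_per_symbol = 72 / 3 = 24, i.e. three 8-byte
	 * register/value pairs per kernel, which is exactly what
	 * r600_shader_binary_read_config() walks below. */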
354 }
355
356 static const unsigned char *r600_shader_binary_config_start(
357 const struct r600_shader_binary *binary,
358 uint64_t symbol_offset)
359 {
360 unsigned i;
361 for (i = 0; i < binary->global_symbol_count; ++i) {
362 if (binary->global_symbol_offsets[i] == symbol_offset) {
363 unsigned offset = i * binary->config_size_per_symbol;
364 return binary->config + offset;
365 }
366 }
367 return binary->config;
368 }
369
370 static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
371 struct r600_bytecode *bc,
372 uint64_t symbol_offset,
373 boolean *use_kill)
374 {
375 unsigned i;
376 const unsigned char *config =
377 r600_shader_binary_config_start(binary, symbol_offset);
378
379 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
380 unsigned reg =
381 util_le32_to_cpu(*(uint32_t*)(config + i));
382 unsigned value =
383 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
384 switch (reg) {
385 /* R600 / R700 */
386 case R_028850_SQ_PGM_RESOURCES_PS:
387 case R_028868_SQ_PGM_RESOURCES_VS:
388 /* Evergreen / Northern Islands */
389 case R_028844_SQ_PGM_RESOURCES_PS:
390 case R_028860_SQ_PGM_RESOURCES_VS:
391 case R_0288D4_SQ_PGM_RESOURCES_LS:
392 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
393 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
394 break;
395 case R_02880C_DB_SHADER_CONTROL:
396 *use_kill = G_02880C_KILL_ENABLE(value);
397 break;
398 case R_0288E8_SQ_LDS_ALLOC:
399 bc->nlds_dw = value;
400 break;
401 }
402 }
403 }
404
405 static unsigned r600_create_shader(struct r600_bytecode *bc,
406 const struct r600_shader_binary *binary,
407 boolean *use_kill)
408
409 {
410 assert(binary->code_size % 4 == 0);
411 bc->bytecode = CALLOC(1, binary->code_size);
412 memcpy(bc->bytecode, binary->code, binary->code_size);
413 bc->ndw = binary->code_size / 4;
414
415 r600_shader_binary_read_config(binary, bc, 0, use_kill);
416 return 0;
417 }
418
419 #endif
420
421 static void r600_destroy_shader(struct r600_bytecode *bc)
422 {
423 FREE(bc->bytecode);
424 }
425
426 static void *evergreen_create_compute_state(struct pipe_context *ctx,
427 const struct pipe_compute_state *cso)
428 {
429 struct r600_context *rctx = (struct r600_context *)ctx;
430 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
431 #ifdef HAVE_OPENCL
432 const struct pipe_binary_program_header *header;
433 void *p;
434 boolean use_kill;
435 #endif
436
437 shader->ctx = rctx;
438 shader->local_size = cso->req_local_mem;
439 shader->private_size = cso->req_private_mem;
440 shader->input_size = cso->req_input_mem;
441
442 shader->ir_type = cso->ir_type;
443
444 if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
445 shader->ir_type == PIPE_SHADER_IR_NIR) {
446 shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
447 return shader;
448 }
449 #ifdef HAVE_OPENCL
450 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
451 header = cso->prog;
452 radeon_shader_binary_init(&shader->binary);
453 r600_elf_read(header->blob, header->num_bytes, &shader->binary);
454 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
455
456 /* Upload code + ROdata */
457 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
458 shader->bc.ndw * 4);
459 p = r600_buffer_map_sync_with_rings(
460 &rctx->b, shader->code_bo,
461 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
462 //TODO: use util_memcpy_cpu_to_le32 ?
463 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
464 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
465 #endif
466
467 return shader;
468 }
469
470 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
471 {
472 struct r600_context *rctx = (struct r600_context *)ctx;
473 struct r600_pipe_compute *shader = state;
474
475 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
476
477 if (!shader)
478 return;
479
480 if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
481 shader->ir_type == PIPE_SHADER_IR_NIR) {
482 r600_delete_shader_selector(ctx, shader->sel);
483 } else {
484 #ifdef HAVE_OPENCL
485 radeon_shader_binary_clean(&shader->binary);
486 pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
487 pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
488 #endif
489 r600_destroy_shader(&shader->bc);
490 }
491 FREE(shader);
492 }
493
494 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
495 {
496 struct r600_context *rctx = (struct r600_context *)ctx;
497 struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
498 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
499
500 if (!state) {
501 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
502 return;
503 }
504
505 if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
506 cstate->ir_type == PIPE_SHADER_IR_NIR) {
507 bool compute_dirty;
508 cstate->sel->ir_type = cstate->ir_type;
509 if (r600_shader_select(ctx, cstate->sel, &compute_dirty))
510 R600_ERR("Failed to select compute shader\n");
511 }
512
513 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
514 }
515
516 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
517 * kernel parameters, there are implicit parameters that need to be stored
518 * in the vertex buffer as well. Here is how these parameters are organized in
519 * the buffer:
520 *
521 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
522 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
523 * DWORDS 6-8: Number of work items within each work group in each dimension
524 * (x,y,z)
525 * DWORDS 9+ : Kernel parameters
526 */
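/* Illustrative example (values invented for this comment): a dispatch of a
 * 2x2x1 grid of 64x1x1 blocks with 16 bytes of kernel arguments would be
 * laid out by evergreen_compute_upload_input() as:
 *   DWORDS 0-2 : 2, 2, 1    (work groups per dimension)
 *   DWORDS 3-5 : 128, 2, 1  (global work items = grid * block)
 *   DWORDS 6-8 : 64, 1, 1   (work items per group)
 *   DWORDS 9-12: the 16 bytes of kernel arguments
 */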
527 static void evergreen_compute_upload_input(struct pipe_context *ctx,
528 const struct pipe_grid_info *info)
529 {
530 struct r600_context *rctx = (struct r600_context *)ctx;
531 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
532 unsigned i;
533 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
534 * parameters.
535 */
536 unsigned input_size;
537 uint32_t *num_work_groups_start;
538 uint32_t *global_size_start;
539 uint32_t *local_size_start;
540 uint32_t *kernel_parameters_start;
541 struct pipe_box box;
542 struct pipe_transfer *transfer = NULL;
543
544 if (!shader)
545 return;
546 if (shader->input_size == 0) {
547 return;
548 }
549 input_size = shader->input_size + 36;
550 if (!shader->kernel_param) {
551 /* Add space for the grid dimensions */
552 shader->kernel_param = (struct r600_resource *)
553 pipe_buffer_create(ctx->screen, 0,
554 PIPE_USAGE_IMMUTABLE, input_size);
555 }
556
557 u_box_1d(0, input_size, &box);
558 num_work_groups_start = ctx->transfer_map(ctx,
559 (struct pipe_resource*)shader->kernel_param,
560 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
561 &box, &transfer);
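	/* The mapping is viewed as uint32_t values, so each implicit-parameter
	 * section (work group count, global size, local size) is 3 elements
	 * (12 bytes) long; the "3 * (sizeof(uint) / 4)" terms below simply
	 * advance the pointer by 3 dwords. */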
562 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
563 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
564 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
565
566 /* Copy the number of work groups (grid size) */
567 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
568
569 /* Copy the global size */
570 for (i = 0; i < 3; i++) {
571 global_size_start[i] = info->grid[i] * info->block[i];
572 }
573
574 /* Copy the local dimensions */
575 memcpy(local_size_start, info->block, 3 * sizeof(uint));
576
577 /* Copy the kernel inputs */
578 memcpy(kernel_parameters_start, info->input, shader->input_size);
579
580 for (i = 0; i < (input_size / 4); i++) {
581 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
582 ((unsigned*)num_work_groups_start)[i]);
583 }
584
585 ctx->transfer_unmap(ctx, transfer);
586
587 /* ID=0 and ID=3 are reserved for the parameters.
588 * LLVM prefers to use ID=0, but it does not work for dynamic
589 * indices. */
590 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
591 (struct pipe_resource*)shader->kernel_param);
592 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
593 (struct pipe_resource*)shader->kernel_param);
594 }
595
596 static void evergreen_emit_dispatch(struct r600_context *rctx,
597 const struct pipe_grid_info *info,
598 uint32_t indirect_grid[3])
599 {
600 int i;
601 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
602 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
603 bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
604 unsigned num_waves;
605 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
606 unsigned wave_divisor = (16 * num_pipes);
607 int group_size = 1;
608 int grid_size = 1;
609 unsigned lds_size = shader->local_size / 4;
610
611 if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
612 shader->ir_type != PIPE_SHADER_IR_NIR)
613 lds_size += shader->bc.nlds_dw;
614
615 /* Calculate group_size/grid_size */
616 for (i = 0; i < 3; i++) {
617 group_size *= info->block[i];
618 }
619
620 for (i = 0; i < 3; i++) {
621 grid_size *= info->grid[i];
622 }
623
624 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
625 num_waves = (info->block[0] * info->block[1] * info->block[2] +
626 wave_divisor - 1) / wave_divisor;
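	/* Worked example (illustrative only): with a 16x16x1 block (256 threads)
	 * and num_pipes = 8, wave_divisor is 128 and num_waves rounds up to
	 * ceil(256 / 128) = 2 wavefronts per thread block. */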
627
628 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
629 "%u wavefronts per thread block, "
630 "allocating %u dwords lds.\n",
631 num_pipes, num_waves, lds_size);
632
633 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
634
635 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
636 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
637 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
638 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
639
640 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
641 group_size);
642
643 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
644 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
645 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
646 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
647
648 if (rctx->b.chip_class < CAYMAN) {
649 assert(lds_size <= 8192);
650 } else {
651 /* Cayman appears to have a slightly smaller limit, see the
652 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
653 assert(lds_size <= 8160);
654 }
655
656 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
657 lds_size | (num_waves << 14));
658
659 if (info->indirect) {
660 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
661 radeon_emit(cs, indirect_grid[0]);
662 radeon_emit(cs, indirect_grid[1]);
663 radeon_emit(cs, indirect_grid[2]);
664 radeon_emit(cs, 1);
665 } else {
666 /* Dispatch packet */
667 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
668 radeon_emit(cs, info->grid[0]);
669 radeon_emit(cs, info->grid[1]);
670 radeon_emit(cs, info->grid[2]);
671 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
672 radeon_emit(cs, 1);
673 }
674
675 if (rctx->is_debug)
676 eg_trace_emit(rctx);
677 }
678
679 static void compute_setup_cbs(struct r600_context *rctx)
680 {
681 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
682 unsigned i;
683
684 /* Emit colorbuffers. */
685 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
686 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
687 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
688 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
689 (struct r600_resource*)cb->base.texture,
690 RADEON_USAGE_READWRITE,
691 RADEON_PRIO_SHADER_RW_BUFFER);
692
693 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
694 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
695 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
696 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
697 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
698 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
699 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
700 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
701
702 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
703 radeon_emit(cs, reloc);
704
705 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
706 radeon_emit(cs, reloc);
707 }
708 for (; i < 8 ; i++)
709 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
710 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
711 for (; i < 12; i++)
712 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
713 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
714
715 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
716 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
717 rctx->compute_cb_target_mask);
718 }
719
720 static void compute_emit_cs(struct r600_context *rctx,
721 const struct pipe_grid_info *info)
722 {
723 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
724 bool compute_dirty = false;
725 struct r600_pipe_shader *current;
726 struct r600_shader_atomic combined_atomics[8];
727 uint8_t atomic_used_mask;
728 uint32_t indirect_grid[3] = { 0, 0, 0 };
729
730 /* make sure that the gfx ring is the only one active */
731 if (radeon_emitted(rctx->b.dma.cs, 0)) {
732 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
733 }
734
735 r600_update_compressed_resource_state(rctx, true);
736
737 if (!rctx->cmd_buf_is_compute) {
738 rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
739 rctx->cmd_buf_is_compute = true;
740 }
741
742 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
743 rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
744 if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty)) {
745 R600_ERR("Failed to select compute shader\n");
746 return;
747 }
748
749 current = rctx->cs_shader_state.shader->sel->current;
750 if (compute_dirty) {
751 rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
752 r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
753 r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
754 }
755
756 bool need_buf_const = current->shader.uses_tex_buffers ||
757 current->shader.has_txq_cube_array_z_comp;
758
759 if (info->indirect) {
760 struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
761 unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
762 unsigned offset = info->indirect_offset / 4;
763 indirect_grid[0] = data[offset];
764 indirect_grid[1] = data[offset + 1];
765 indirect_grid[2] = data[offset + 2];
766 }
767 for (int i = 0; i < 3; i++) {
768 rctx->cs_block_grid_sizes[i] = info->block[i];
769 rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
770 }
771 rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
772 rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
773
774 evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
775 r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
776
777 if (need_buf_const) {
778 eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
779 }
780 r600_update_driver_const_buffers(rctx, true);
781
782 evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
783 if (atomic_used_mask) {
784 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
785 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
786 }
787 } else
788 r600_need_cs_space(rctx, 0, true, 0);
789
790 /* Initialize all the compute-related registers.
791 *
792 * See evergreen_init_atom_start_compute_cs() in this file for the list
793 * of registers initialized by the start_compute_cs_cmd atom.
794 */
795 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
796
797 /* emit config state */
798 if (rctx->b.chip_class == EVERGREEN) {
799 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
800 rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
801 radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
802 radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
803 radeon_emit(cs, 0);
804 radeon_emit(cs, 0);
805 radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
806 } else
807 r600_emit_atom(rctx, &rctx->config_state.atom);
808 }
809
810 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
811 r600_flush_emit(rctx);
812
813 if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
814 rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {
815
816 compute_setup_cbs(rctx);
817
818 /* Emit vertex buffer state */
819 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
820 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
821 } else {
822 uint32_t rat_mask;
823
824 rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
825 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
826 rat_mask);
827 }
828
829 r600_emit_atom(rctx, &rctx->b.render_cond_atom);
830
831 /* Emit constant buffer state */
832 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
833
834 /* Emit sampler state */
835 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
836
837 /* Emit sampler view (texture resource) state */
838 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
839
840 /* Emit images state */
841 r600_emit_atom(rctx, &rctx->compute_images.atom);
842
843 /* Emit buffers state */
844 r600_emit_atom(rctx, &rctx->compute_buffers.atom);
845
846 /* Emit shader state */
847 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
848
849 /* Emit dispatch state and dispatch packet */
850 evergreen_emit_dispatch(rctx, info, indirect_grid);
851
852 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
853 */
854 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
855 R600_CONTEXT_INV_VERTEX_CACHE |
856 R600_CONTEXT_INV_TEX_CACHE;
857 r600_flush_emit(rctx);
858 rctx->b.flags = 0;
859
860 if (rctx->b.chip_class >= CAYMAN) {
861 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
862 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
863 /* DEALLOC_STATE prevents the GPU from hanging when a
864 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
865 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
866 */
867 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
868 radeon_emit(cs, 0);
869 }
870 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
871 rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
872 evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
873
874 #if 0
875 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
876 for (i = 0; i < cs->cdw; i++) {
877 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
878 }
879 #endif
880
881 }
882
883
884 /**
885 * Emit function for r600_cs_shader_state atom
886 */
887 void evergreen_emit_cs_shader(struct r600_context *rctx,
888 struct r600_atom *atom)
889 {
890 struct r600_cs_shader_state *state =
891 (struct r600_cs_shader_state*)atom;
892 struct r600_pipe_compute *shader = state->shader;
893 struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
894 uint64_t va;
895 struct r600_resource *code_bo;
896 unsigned ngpr, nstack;
897
898 if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
899 shader->ir_type == PIPE_SHADER_IR_NIR) {
900 code_bo = shader->sel->current->bo;
901 va = shader->sel->current->bo->gpu_address;
902 ngpr = shader->sel->current->shader.bc.ngpr;
903 nstack = shader->sel->current->shader.bc.nstack;
904 } else {
905 code_bo = shader->code_bo;
906 va = shader->code_bo->gpu_address + state->pc;
907 ngpr = shader->bc.ngpr;
908 nstack = shader->bc.nstack;
909 }
910
911 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
912 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
913 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
914 S_0288D4_NUM_GPRS(ngpr) |
915 S_0288D4_DX10_CLAMP(1) |
916 S_0288D4_STACK_SIZE(nstack));
917 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
918
919 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
920 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
921 code_bo, RADEON_USAGE_READ,
922 RADEON_PRIO_SHADER_BINARY));
923 }
924
925 static void evergreen_launch_grid(struct pipe_context *ctx,
926 const struct pipe_grid_info *info)
927 {
928 struct r600_context *rctx = (struct r600_context *)ctx;
929 #ifdef HAVE_OPENCL
930 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
931 boolean use_kill;
932
933 if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
934 shader->ir_type != PIPE_SHADER_IR_NIR) {
935 rctx->cs_shader_state.pc = info->pc;
936 /* Get the config information for this kernel. */
937 r600_shader_binary_read_config(&shader->binary, &shader->bc,
938 info->pc, &use_kill);
939 } else {
940 use_kill = false;
941 rctx->cs_shader_state.pc = 0;
942 }
943 #endif
944
945 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
946
947
948 evergreen_compute_upload_input(ctx, info);
949 compute_emit_cs(rctx, info);
950 }
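/*
 * Illustrative sketch (not part of the driver): this is roughly how a state
 * tracker reaches evergreen_launch_grid() through the gallium interface,
 * using only the pipe_grid_info fields this file relies on. The values are
 * invented for the example.
 *
 *   struct pipe_grid_info info = {0};
 *   info.block[0] = 64; info.block[1] = 1; info.block[2] = 1;
 *   info.grid[0]  = 16; info.grid[1]  = 1; info.grid[2]  = 1;
 *   info.pc = 0;        // entry point offset into the kernel binary
 *   info.input = args;  // flat buffer of kernel arguments, if any
 *   ctx->launch_grid(ctx, &info);
 */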
951
952 static void evergreen_set_compute_resources(struct pipe_context *ctx,
953 unsigned start, unsigned count,
954 struct pipe_surface **surfaces)
955 {
956 struct r600_context *rctx = (struct r600_context *)ctx;
957 struct r600_surface **resources = (struct r600_surface **)surfaces;
958
959 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
960 start, count);
961
962 for (unsigned i = 0; i < count; i++) {
963 /* The first four vertex buffers are reserved for parameters and
964 * global buffers. */
965 unsigned vtx_id = 4 + i;
966 if (resources[i]) {
967 struct r600_resource_global *buffer =
968 (struct r600_resource_global*)
969 resources[i]->base.texture;
970 if (resources[i]->base.writable) {
971 assert(i+1 < 12);
972
973 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
974 (struct r600_resource *)resources[i]->base.texture,
975 buffer->chunk->start_in_dw*4,
976 resources[i]->base.texture->width0);
977 }
978
979 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
980 buffer->chunk->start_in_dw * 4,
981 resources[i]->base.texture);
982 }
983 }
984 }
985
986 static void evergreen_set_global_binding(struct pipe_context *ctx,
987 unsigned first, unsigned n,
988 struct pipe_resource **resources,
989 uint32_t **handles)
990 {
991 struct r600_context *rctx = (struct r600_context *)ctx;
992 struct compute_memory_pool *pool = rctx->screen->global_pool;
993 struct r600_resource_global **buffers =
994 (struct r600_resource_global **)resources;
995 unsigned i;
996
997 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
998 first, n);
999
1000 if (!resources) {
1001 /* XXX: Unset */
1002 return;
1003 }
1004
1005 /* We mark these items for promotion to the pool if they
1006 * aren't already there */
1007 for (i = first; i < first + n; i++) {
1008 struct compute_memory_item *item = buffers[i]->chunk;
1009
1010 if (!is_item_in_pool(item))
1011 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
1012 }
1013
1014 if (compute_memory_finalize_pending(pool, ctx) == -1) {
1015 /* XXX: Unset */
1016 return;
1017 }
1018
1019 for (i = first; i < first + n; i++)
1020 {
1021 uint32_t buffer_offset;
1022 uint32_t handle;
1023 assert(resources[i]->target == PIPE_BUFFER);
1024 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
1025
1026 buffer_offset = util_le32_to_cpu(*(handles[i]));
1027 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
1028
1029 *(handles[i]) = util_cpu_to_le32(handle);
1030 }
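	/* Illustrative example (numbers invented): if a global buffer's chunk
	 * starts at dword 256 of the pool (byte 1024) and the application passed
	 * in a handle value of 16, the handle written back is 16 + 256 * 4 =
	 * 1040, i.e. an offset into the shared pool rather than into the
	 * individual buffer. */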
1031
1032 /* globals for writing */
1033 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1034 /* globals for reading */
1035 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1036 (struct pipe_resource*)pool->bo);
1037
1038 /* constants for reading, LLVM puts them in text segment */
1039 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1040 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1041 }
1042
1043 /**
1044 * This function initializes all the compute specific registers that need to
1045 * be initialized for each compute command stream. Registers that are common
1046 * to both compute and 3D will be initialized at the beginning of each compute
1047 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
1048 * packet requires that the shader type bit be set, we must initialize all
1049 * context registers needed for compute in this function. The registers
1050 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1051 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1052 * on the GPU family.
1053 */
1054 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1055 {
1056 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1057 int num_threads;
1058 int num_stack_entries;
1059
1060 /* since all required registers are initialized in the
1061 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1062 */
1063 r600_init_command_buffer(cb, 256);
1064 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1065
1066 /* We're setting config registers here. */
1067 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1068 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1069
1070 switch (rctx->b.family) {
1071 case CHIP_CEDAR:
1072 default:
1073 num_threads = 128;
1074 num_stack_entries = 256;
1075 break;
1076 case CHIP_REDWOOD:
1077 num_threads = 128;
1078 num_stack_entries = 256;
1079 break;
1080 case CHIP_JUNIPER:
1081 num_threads = 128;
1082 num_stack_entries = 512;
1083 break;
1084 case CHIP_CYPRESS:
1085 case CHIP_HEMLOCK:
1086 num_threads = 128;
1087 num_stack_entries = 512;
1088 break;
1089 case CHIP_PALM:
1090 num_threads = 128;
1091 num_stack_entries = 256;
1092 break;
1093 case CHIP_SUMO:
1094 num_threads = 128;
1095 num_stack_entries = 256;
1096 break;
1097 case CHIP_SUMO2:
1098 num_threads = 128;
1099 num_stack_entries = 512;
1100 break;
1101 case CHIP_BARTS:
1102 num_threads = 128;
1103 num_stack_entries = 512;
1104 break;
1105 case CHIP_TURKS:
1106 num_threads = 128;
1107 num_stack_entries = 256;
1108 break;
1109 case CHIP_CAICOS:
1110 num_threads = 128;
1111 num_stack_entries = 256;
1112 break;
1113 }
1114
1115 /* The primitive type always needs to be POINTLIST for compute. */
1116 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1117 V_008958_DI_PT_POINTLIST);
1118
1119 if (rctx->b.chip_class < CAYMAN) {
1120
1121 /* These registers control which simds can be used by each stage.
1122 * The default for these registers is 0xffffffff, which means
1123 * all simds are available for each stage. It's possible we may
1124 * want to play around with these in the future, but for now
1125 * the default value is fine.
1126 *
1127 * R_008E20_SQ_STATIC_THREAD_MGMT1
1128 * R_008E24_SQ_STATIC_THREAD_MGMT2
1129 * R_008E28_SQ_STATIC_THREAD_MGMT3
1130 */
1131
1132 /* XXX: We may need to adjust the thread and stack resource
1133 * values for 3D/compute interop */
1134
1135 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1136
1137 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1138 * Set the number of threads used by the PS/VS/GS/ES stage to
1139 * 0.
1140 */
1141 r600_store_value(cb, 0);
1142
1143 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1144 * Set the number of threads used by the CS (aka LS) stage to
1145 * the maximum number of threads and set the number of threads
1146 * for the HS stage to 0. */
1147 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1148
1149 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1150 * Set the Control Flow stack entries to 0 for PS/VS stages */
1151 r600_store_value(cb, 0);
1152
1153 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1154 * Set the Control Flow stack entries to 0 for GS/ES stages */
1155 r600_store_value(cb, 0);
1156
1157 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1158 * Set the Control Flow stack entries to 0 for the HS stage, and
1159 * set it to the maximum value for the CS (aka LS) stage. */
1160 r600_store_value(cb,
1161 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1162 }
1163 /* Give the compute shader all the available LDS space.
1164 * NOTE: This only sets the maximum number of dwords that a compute
1165 * shader can allocate. When a shader is executed, we still need to
1166 * allocate the appropriate amount of LDS dwords using the
1167 * CM_R_0288E8_SQ_LDS_ALLOC register.
1168 */
1169 if (rctx->b.chip_class < CAYMAN) {
1170 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1171 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1172 } else {
1173 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1174 S_0286FC_NUM_PS_LDS(0) |
1175 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1176 }
1177
1178 /* Context Registers */
1179
1180 if (rctx->b.chip_class < CAYMAN) {
1181 /* workaround for hw issues with dyn gpr - must set all limits
1182 * to 240 instead of 0, 0x1e == 240 / 8
1183 */
1184 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1185 S_028838_PS_GPRS(0x1e) |
1186 S_028838_VS_GPRS(0x1e) |
1187 S_028838_GS_GPRS(0x1e) |
1188 S_028838_ES_GPRS(0x1e) |
1189 S_028838_HS_GPRS(0x1e) |
1190 S_028838_LS_GPRS(0x1e));
1191 }
1192
1193 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1194 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1195 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1196
1197 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1198
1199 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1200 S_0286E8_TID_IN_GROUP_ENA(1) |
1201 S_0286E8_TGID_ENA(1) |
1202 S_0286E8_DISABLE_INDEX_PACK(1));
1203
1204 /* The LOOP_CONST registers are an optimization for loops that allows
1205 * you to store the initial counter, increment value, and maximum
1206 * counter value in a register so that the hardware can calculate the
1207 * correct number of iterations for the loop, so that you don't need
1208 * to have the loop counter in your shader code. We don't currently use
1209 * this optimization, so we must keep track of the counter in the
1210 * shader and use a break instruction to exit loops. However, the
1211 * hardware will still use this register to determine when to exit a
1212 * loop, so we need to initialize the counter to 0, set the increment
1213 * value to 1 and the maximum counter value to 4095 (0xfff), which
1214 * is the maximum value allowed. This gives us a maximum of 4096
1215 * iterations for our loops, but hopefully our break instruction will
1216 * execute some time before the 4096th iteration.
1217 */
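	/* Following the description above, the 0x1000FFF value used below appears
	 * to pack a maximum count of 0xFFF in the low bits, an initial value of 0,
	 * and an increment of 1 in the top byte; this is an interpretation of the
	 * constant, not taken from documentation. */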
1218 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1219 }
1220
1221 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1222 {
1223 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1224 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1225 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1226 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1227 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1228 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1229 rctx->b.b.launch_grid = evergreen_launch_grid;
1230
1231 }
1232
1233 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1234 struct pipe_resource *resource,
1235 unsigned level,
1236 unsigned usage,
1237 const struct pipe_box *box,
1238 struct pipe_transfer **ptransfer)
1239 {
1240 struct r600_context *rctx = (struct r600_context*)ctx;
1241 struct compute_memory_pool *pool = rctx->screen->global_pool;
1242 struct r600_resource_global* buffer =
1243 (struct r600_resource_global*)resource;
1244
1245 struct compute_memory_item *item = buffer->chunk;
1246 struct pipe_resource *dst = NULL;
1247 unsigned offset = box->x;
1248
1249 if (is_item_in_pool(item)) {
1250 compute_memory_demote_item(pool, item, ctx);
1251 }
1252 else {
1253 if (item->real_buffer == NULL) {
1254 item->real_buffer =
1255 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1256 }
1257 }
1258
1259 dst = (struct pipe_resource*)item->real_buffer;
1260
1261 if (usage & PIPE_TRANSFER_READ)
1262 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1263
1264 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1265 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1266 "width = %u, height = %u, depth = %u)\n", level, usage,
1267 box->x, box->y, box->z, box->width, box->height,
1268 box->depth);
1269 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1270 "%u (box.x)\n", item->id, box->x);
1271
1272
1273 assert(resource->target == PIPE_BUFFER);
1274 assert(resource->bind & PIPE_BIND_GLOBAL);
1275 assert(box->x >= 0);
1276 assert(box->y == 0);
1277 assert(box->z == 0);
1278
1279 // TODO: do it better; mapping is not possible if the pool is too big
1280 return pipe_buffer_map_range(ctx, dst,
1281 offset, box->width, usage, ptransfer);
1282 }
1283
1284 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1285 struct pipe_transfer *transfer)
1286 {
1287 /* A struct r600_resource_global is not a real resource; it just maps
1288 * to an offset within the compute memory pool. The function
1289 * r600_compute_global_transfer_map() maps the memory pool
1290 * resource rather than the struct r600_resource_global passed to
1291 * it as an argument and then initializes ptransfer->resource with
1292 * the memory pool resource (via pipe_buffer_map_range).
1293 * When transfer_unmap is called it uses the memory pool's
1294 * vtable which calls r600_buffer_transfer_unmap() rather than
1295 * this function.
1296 */
1297 assert (!"This function should not be called");
1298 }
1299
1300 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1301 struct pipe_transfer *transfer,
1302 const struct pipe_box *box)
1303 {
1304 assert(0 && "TODO");
1305 }
1306
1307 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1308 struct pipe_resource *res)
1309 {
1310 struct r600_resource_global* buffer = NULL;
1311 struct r600_screen* rscreen = NULL;
1312
1313 assert(res->target == PIPE_BUFFER);
1314 assert(res->bind & PIPE_BIND_GLOBAL);
1315
1316 buffer = (struct r600_resource_global*)res;
1317 rscreen = (struct r600_screen*)screen;
1318
1319 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1320
1321 buffer->chunk = NULL;
1322 free(res);
1323 }
1324
1325 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1326 {
1327 u_default_resource_get_handle, /* get_handle */
1328 r600_compute_global_buffer_destroy, /* resource_destroy */
1329 r600_compute_global_transfer_map, /* transfer_map */
1330 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1331 r600_compute_global_transfer_unmap, /* transfer_unmap */
1332 };
1333
1334 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1335 const struct pipe_resource *templ)
1336 {
1337 struct r600_resource_global* result = NULL;
1338 struct r600_screen* rscreen = NULL;
1339 int size_in_dw = 0;
1340
1341 assert(templ->target == PIPE_BUFFER);
1342 assert(templ->bind & PIPE_BIND_GLOBAL);
1343 assert(templ->array_size == 1 || templ->array_size == 0);
1344 assert(templ->depth0 == 1 || templ->depth0 == 0);
1345 assert(templ->height0 == 1 || templ->height0 == 0);
1346
1347 result = (struct r600_resource_global*)
1348 CALLOC(sizeof(struct r600_resource_global), 1);
1349 rscreen = (struct r600_screen*)screen;
1350
1351 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1352 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1353 templ->array_size);
1354
1355 result->base.b.vtbl = &r600_global_buffer_vtbl;
1356 result->base.b.b = *templ;
1357 result->base.b.b.screen = screen;
1358 pipe_reference_init(&result->base.b.b.reference, 1);
1359
1360 size_in_dw = (templ->width0+3) / 4;
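	/* The pool is managed in dword units, so the requested byte size is
	 * rounded up above; e.g. a 10-byte buffer occupies (10 + 3) / 4 = 3
	 * dwords. */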
1361
1362 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1363
1364 if (result->chunk == NULL)
1365 {
1366 free(result);
1367 return NULL;
1368 }
1369
1370 return &result->base.b.b;
1371 }