0573f8ee2b5f5e0fe44434aefdcf74ef1c6b1605
mesa.git: src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "pipebuffer/pb_buffer.h"
45 #include "evergreend.h"
46 #include "r600_shader.h"
47 #include "r600_pipe.h"
48 #include "r600_formats.h"
49 #include "evergreen_compute.h"
50 #include "evergreen_compute_internal.h"
51 #include "compute_memory_pool.h"
52 #include "sb/sb_public.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding writes
57 VTX1 is for global binding reads
58
59 for writing images: RAT1...
60 for reading images: TEX2...
61 TEX2 and RAT1 are paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => 10 image bindings for writing max.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too because of linear indexing
83
84 */
85
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 0, PIPE_USAGE_IMMUTABLE, size);
94
95 return (struct r600_resource *)buffer;
96 }
97
98
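/**
 * Bind a buffer as a RAT (random access target) so the compute shader can
 * write to it: the buffer is wrapped in an R32_UINT surface, stored in
 * framebuffer color-buffer slot 'id', and the compute cb_target_mask is
 * updated to enable that slot.
 */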
99 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
100 unsigned id,
101 struct r600_resource *bo,
102 int start,
103 int size)
104 {
105 struct pipe_surface rat_templ;
106 struct r600_surface *surf = NULL;
107 struct r600_context *rctx = NULL;
108
109 assert(id < 12);
110 assert((size & 3) == 0);
111 assert((start & 0xFF) == 0);
112
113 rctx = pipe->ctx;
114
115 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
116
117 /* Create the RAT surface */
118 memset(&rat_templ, 0, sizeof(rat_templ));
119 rat_templ.format = PIPE_FORMAT_R32_UINT;
120 rat_templ.u.tex.level = 0;
121 rat_templ.u.tex.first_layer = 0;
122 rat_templ.u.tex.last_layer = 0;
123
124 /* Add the RAT to the list of color buffers */
125 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
126 (struct pipe_context *)pipe->ctx,
127 (struct pipe_resource *)bo, &rat_templ);
128
129 /* Update the number of color buffers */
130 pipe->ctx->framebuffer.state.nr_cbufs =
131 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
132
133 /* Update the cb_target_mask
134 * XXX: I think this is a potential spot for bugs once we start doing
135 * GL interop. cb_target_mask may be modified in the 3D sections
136 * of this driver. */
137 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
138
139 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
140 evergreen_init_color_surface_rat(rctx, surf);
141 }
142
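/**
 * Point compute vertex buffer slot 'vb_index' at 'buffer' with the given
 * byte offset, mark the slot enabled and dirty, and request a vertex
 * (texture) cache invalidation so the shader sees the new contents.
 */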
143 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
144 unsigned vb_index,
145 unsigned offset,
146 struct pipe_resource *buffer)
147 {
148 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
149 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
150 vb->stride = 1;
151 vb->buffer_offset = offset;
152 vb->buffer.resource = buffer;
153 vb->is_user_buffer = false;
154
155 /* The vertex instructions in the compute shaders use the texture cache,
156 * so we need to invalidate it. */
157 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
158 state->enabled_mask |= 1 << vb_index;
159 state->dirty_mask |= 1 << vb_index;
160 r600_mark_atom_dirty(rctx, &state->atom);
161 }
162
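/**
 * Bind a sub-range of 'buffer' as constant buffer 'cb_index' of the
 * compute shader through the regular set_constant_buffer() hook.
 */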
163 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
164 unsigned cb_index,
165 unsigned offset,
166 unsigned size,
167 struct pipe_resource *buffer)
168 {
169 struct pipe_constant_buffer cb;
170 cb.buffer_size = size;
171 cb.buffer_offset = offset;
172 cb.buffer = buffer;
173 cb.user_buffer = NULL;
174
175 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
176 }
177
178 /* We need to define these R600 registers here, because we can't include
179 * evergreend.h and r600d.h.
180 */
181 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
182 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
183
184 #ifdef HAVE_OPENCL
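/**
 * Record the offsets (st_value) of all defined global symbols from an ELF
 * symbol table into binary->global_symbol_offsets, keeping the list in
 * ascending order, and bump binary->global_symbol_count for each entry.
 */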
185 static void parse_symbol_table(Elf_Data *symbol_table_data,
186 const GElf_Shdr *symbol_table_header,
187 struct ac_shader_binary *binary)
188 {
189 GElf_Sym symbol;
190 unsigned i = 0;
191 unsigned symbol_count =
192 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
193
194 /* We are over-allocating this list, because symbol_count gives the
195 * total number of symbols, and we will only be filling the list
196 * with offsets of global symbols. The memory savings from
197 * allocating the correct size of this list will be small, and
198 * I don't think it is worth the cost of pre-computing the number
199 * of global symbols.
200 */
201 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
202
203 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
204 unsigned i;
205 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
206 symbol.st_shndx == 0 /* Undefined symbol */) {
207 continue;
208 }
209
210 binary->global_symbol_offsets[binary->global_symbol_count] =
211 symbol.st_value;
212
213 /* Sort the list using bubble sort. This list will usually
214 * be small. */
215 for (i = binary->global_symbol_count; i > 0; --i) {
216 uint64_t lhs = binary->global_symbol_offsets[i - 1];
217 uint64_t rhs = binary->global_symbol_offsets[i];
218 if (lhs < rhs) {
219 break;
220 }
221 binary->global_symbol_offsets[i] = lhs;
222 binary->global_symbol_offsets[i - 1] = rhs;
223 }
224 ++binary->global_symbol_count;
225 }
226 }
227
228
229 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
230 unsigned symbol_sh_link,
231 struct ac_shader_binary *binary)
232 {
233 unsigned i;
234
235 if (!relocs || !symbols || !binary->reloc_count) {
236 return;
237 }
238 binary->relocs = CALLOC(binary->reloc_count,
239 sizeof(struct ac_shader_reloc));
240 for (i = 0; i < binary->reloc_count; i++) {
241 GElf_Sym symbol;
242 GElf_Rel rel;
243 char *symbol_name;
244 struct ac_shader_reloc *reloc = &binary->relocs[i];
245
246 gelf_getrel(relocs, i, &rel);
247 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
248 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
249
250 reloc->offset = rel.r_offset;
251 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
252 reloc->name[sizeof(reloc->name)-1] = 0;
253 }
254 }
255
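/**
 * Parse the ELF binary produced by the compiler: copy .text into
 * binary->code, .AMDGPU.config into binary->config and .rodata into
 * binary->rodata, keep the optional .AMDGPU.disasm string, walk the
 * symbol table and .rel.text relocations, and derive
 * config_size_per_symbol for binaries that contain several kernels.
 */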
256 static void r600_elf_read(const char *elf_data, unsigned elf_size,
257 struct ac_shader_binary *binary)
258 {
259 char *elf_buffer;
260 Elf *elf;
261 Elf_Scn *section = NULL;
262 Elf_Data *symbols = NULL, *relocs = NULL;
263 size_t section_str_index;
264 unsigned symbol_sh_link = 0;
265
266 /* One of the libelf implementations
267 * (http://www.mr511.de/software/english.htm) requires calling
268 * elf_version() before elf_memory().
269 */
270 elf_version(EV_CURRENT);
271 elf_buffer = MALLOC(elf_size);
272 memcpy(elf_buffer, elf_data, elf_size);
273
274 elf = elf_memory(elf_buffer, elf_size);
275
276 elf_getshdrstrndx(elf, &section_str_index);
277
278 while ((section = elf_nextscn(elf, section))) {
279 const char *name;
280 Elf_Data *section_data = NULL;
281 GElf_Shdr section_header;
282 if (gelf_getshdr(section, &section_header) != &section_header) {
283 fprintf(stderr, "Failed to read ELF section header\n");
284 return;
285 }
286 name = elf_strptr(elf, section_str_index, section_header.sh_name);
287 if (!strcmp(name, ".text")) {
288 section_data = elf_getdata(section, section_data);
289 binary->code_size = section_data->d_size;
290 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
291 memcpy(binary->code, section_data->d_buf, binary->code_size);
292 } else if (!strcmp(name, ".AMDGPU.config")) {
293 section_data = elf_getdata(section, section_data);
294 binary->config_size = section_data->d_size;
295 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
296 memcpy(binary->config, section_data->d_buf, binary->config_size);
297 } else if (!strcmp(name, ".AMDGPU.disasm")) {
298 /* Always read disassembly if it's available. */
299 section_data = elf_getdata(section, section_data);
300 binary->disasm_string = strndup(section_data->d_buf,
301 section_data->d_size);
302 } else if (!strncmp(name, ".rodata", 7)) {
303 section_data = elf_getdata(section, section_data);
304 binary->rodata_size = section_data->d_size;
305 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
306 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
307 } else if (!strncmp(name, ".symtab", 7)) {
308 symbols = elf_getdata(section, section_data);
309 symbol_sh_link = section_header.sh_link;
310 parse_symbol_table(symbols, &section_header, binary);
311 } else if (!strcmp(name, ".rel.text")) {
312 relocs = elf_getdata(section, section_data);
313 binary->reloc_count = section_header.sh_size /
314 section_header.sh_entsize;
315 }
316 }
317
318 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
319
320 if (elf){
321 elf_end(elf);
322 }
323 FREE(elf_buffer);
324
325 /* Cache the config size per symbol */
326 if (binary->global_symbol_count) {
327 binary->config_size_per_symbol =
328 binary->config_size / binary->global_symbol_count;
329 } else {
330 binary->global_symbol_count = 1;
331 binary->config_size_per_symbol = binary->config_size;
332 }
333 }
334
335 static const unsigned char *r600_shader_binary_config_start(
336 const struct ac_shader_binary *binary,
337 uint64_t symbol_offset)
338 {
339 unsigned i;
340 for (i = 0; i < binary->global_symbol_count; ++i) {
341 if (binary->global_symbol_offsets[i] == symbol_offset) {
342 unsigned offset = i * binary->config_size_per_symbol;
343 return binary->config + offset;
344 }
345 }
346 return binary->config;
347 }
348
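/**
 * Walk the (register, value) dword pairs of the .AMDGPU.config section
 * belonging to the kernel at 'symbol_offset' and extract the GPR count,
 * stack size, LDS allocation and kill-enable flag into the bytecode
 * struct.
 */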
349 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
350 struct r600_bytecode *bc,
351 uint64_t symbol_offset,
352 boolean *use_kill)
353 {
354 unsigned i;
355 const unsigned char *config =
356 r600_shader_binary_config_start(binary, symbol_offset);
357
358 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
359 unsigned reg =
360 util_le32_to_cpu(*(uint32_t*)(config + i));
361 unsigned value =
362 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
363 switch (reg) {
364 /* R600 / R700 */
365 case R_028850_SQ_PGM_RESOURCES_PS:
366 case R_028868_SQ_PGM_RESOURCES_VS:
367 /* Evergreen / Northern Islands */
368 case R_028844_SQ_PGM_RESOURCES_PS:
369 case R_028860_SQ_PGM_RESOURCES_VS:
370 case R_0288D4_SQ_PGM_RESOURCES_LS:
371 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
372 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
373 break;
374 case R_02880C_DB_SHADER_CONTROL:
375 *use_kill = G_02880C_KILL_ENABLE(value);
376 break;
377 case R_0288E8_SQ_LDS_ALLOC:
378 bc->nlds_dw = value;
379 break;
380 }
381 }
382 }
383
384 static unsigned r600_create_shader(struct r600_bytecode *bc,
385 const struct ac_shader_binary *binary,
386 boolean *use_kill)
387
388 {
389 assert(binary->code_size % 4 == 0);
390 bc->bytecode = CALLOC(1, binary->code_size);
391 memcpy(bc->bytecode, binary->code, binary->code_size);
392 bc->ndw = binary->code_size / 4;
393
394 r600_shader_binary_read_config(binary, bc, 0, use_kill);
395 return 0;
396 }
397
398 #endif
399
400 static void r600_destroy_shader(struct r600_bytecode *bc)
401 {
402 FREE(bc->bytecode);
403 }
404
405 static void *evergreen_create_compute_state(struct pipe_context *ctx,
406 const struct pipe_compute_state *cso)
407 {
408 struct r600_context *rctx = (struct r600_context *)ctx;
409 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
410 #ifdef HAVE_OPENCL
411 const struct pipe_llvm_program_header *header;
412 const char *code;
413 void *p;
414 boolean use_kill;
415
416 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
417 header = cso->prog;
418 code = cso->prog + sizeof(struct pipe_llvm_program_header);
419 radeon_shader_binary_init(&shader->binary);
420 r600_elf_read(code, header->num_bytes, &shader->binary);
421 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
422
423 /* Upload code + ROdata */
424 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
425 shader->bc.ndw * 4);
426 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
427 //TODO: use util_memcpy_cpu_to_le32 ?
428 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
429 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
430 #endif
431
432 shader->ctx = rctx;
433 shader->local_size = cso->req_local_mem;
434 shader->private_size = cso->req_private_mem;
435 shader->input_size = cso->req_input_mem;
436
437 return shader;
438 }
439
440 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
441 {
442 struct r600_context *rctx = (struct r600_context *)ctx;
443 struct r600_pipe_compute *shader = state;
444
445 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
446
447 if (!shader)
448 return;
449
450 #ifdef HAVE_OPENCL
451 radeon_shader_binary_clean(&shader->binary);
452 #endif
453 r600_destroy_shader(&shader->bc);
454
455 /* TODO destroy shader->code_bo, shader->const_bo
456 * we'll need something like r600_buffer_free */
457 FREE(shader);
458 }
459
460 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
461 {
462 struct r600_context *rctx = (struct r600_context *)ctx;
463
464 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
465
466 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
467 }
468
469 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
470 * kernel parameters, there are implicit parameters that need to be stored
471 * in the vertex buffer as well. Here is how these parameters are organized in
472 * the buffer:
473 *
474 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
475 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
476 * DWORDS 6-8: Number of work items within each work group in each dimension
477 * (x,y,z)
478 * DWORDS 9+ : Kernel parameters
479 */
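/* For example (hypothetical numbers, purely illustrative): a launch with
 * grid = (4, 2, 1) and block = (64, 1, 1) would be uploaded as
 *   DWORDS 0-2: 4, 2, 1     (work groups)
 *   DWORDS 3-5: 256, 2, 1   (global work items = grid * block)
 *   DWORDS 6-8: 64, 1, 1    (work items per group)
 *   DWORDS 9+ : the kernel's own arguments
 */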
480 static void evergreen_compute_upload_input(struct pipe_context *ctx,
481 const struct pipe_grid_info *info)
482 {
483 struct r600_context *rctx = (struct r600_context *)ctx;
484 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
485 unsigned i;
486 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
487 * parameters.
488 */
489 unsigned input_size = shader->input_size + 36;
490 uint32_t *num_work_groups_start;
491 uint32_t *global_size_start;
492 uint32_t *local_size_start;
493 uint32_t *kernel_parameters_start;
494 struct pipe_box box;
495 struct pipe_transfer *transfer = NULL;
496
497 if (shader->input_size == 0) {
498 return;
499 }
500
501 if (!shader->kernel_param) {
502 /* Add space for the grid dimensions */
503 shader->kernel_param = (struct r600_resource *)
504 pipe_buffer_create(ctx->screen, 0,
505 PIPE_USAGE_IMMUTABLE, input_size);
506 }
507
508 u_box_1d(0, input_size, &box);
509 num_work_groups_start = ctx->transfer_map(ctx,
510 (struct pipe_resource*)shader->kernel_param,
511 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
512 &box, &transfer);
513 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
514 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
515 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
516
517 /* Copy the work group size */
518 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
519
520 /* Copy the global size */
521 for (i = 0; i < 3; i++) {
522 global_size_start[i] = info->grid[i] * info->block[i];
523 }
524
525 /* Copy the local dimensions */
526 memcpy(local_size_start, info->block, 3 * sizeof(uint));
527
528 /* Copy the kernel inputs */
529 memcpy(kernel_parameters_start, info->input, shader->input_size);
530
531 for (i = 0; i < (input_size / 4); i++) {
532 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
533 ((unsigned*)num_work_groups_start)[i]);
534 }
535
536 ctx->transfer_unmap(ctx, transfer);
537
538 /* ID=0 and ID=3 are reserved for the parameters.
539 * LLVM will preferably use ID=0, but it does not work for dynamic
540 * indices. */
541 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
542 (struct pipe_resource*)shader->kernel_param);
543 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
544 (struct pipe_resource*)shader->kernel_param);
545 }
546
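/**
 * Emit the register writes and the DISPATCH_DIRECT packet for one grid
 * launch: thread group size, per-dimension thread counts, the LDS
 * allocation (local memory plus what the shader config requested) and
 * finally the grid dimensions.
 */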
547 static void evergreen_emit_dispatch(struct r600_context *rctx,
548 const struct pipe_grid_info *info)
549 {
550 int i;
551 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
552 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
553 unsigned num_waves;
554 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
555 unsigned wave_divisor = (16 * num_pipes);
556 int group_size = 1;
557 int grid_size = 1;
558 unsigned lds_size = shader->local_size / 4 +
559 shader->bc.nlds_dw;
560
561
562 /* Calculate group_size/grid_size */
563 for (i = 0; i < 3; i++) {
564 group_size *= info->block[i];
565 }
566
567 for (i = 0; i < 3; i++) {
568 grid_size *= info->grid[i];
569 }
570
571 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
572 num_waves = (info->block[0] * info->block[1] * info->block[2] +
573 wave_divisor - 1) / wave_divisor;
574
575 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
576 "%u wavefronts per thread block, "
577 "allocating %u dwords lds.\n",
578 num_pipes, num_waves, lds_size);
579
580 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
581
582 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
583 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
584 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
585 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
586
587 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
588 group_size);
589
590 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
591 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
592 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
593 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
594
595 if (rctx->b.chip_class < CAYMAN) {
596 assert(lds_size <= 8192);
597 } else {
598 /* Cayman appears to have a slightly smaller limit, see the
599 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
600 assert(lds_size <= 8160);
601 }
602
603 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
604 lds_size | (num_waves << 14));
605
606 /* Dispatch packet */
607 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
608 radeon_emit(cs, info->grid[0]);
609 radeon_emit(cs, info->grid[1]);
610 radeon_emit(cs, info->grid[2]);
611 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
612 radeon_emit(cs, 1);
613
614 if (rctx->is_debug)
615 eg_trace_emit(rctx);
616 }
617
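/**
 * Emit the color-buffer (RAT) state on the compute ring: program
 * CB_COLOR0..7 from the surfaces bound in the framebuffer state together
 * with relocations for their backing buffers, mark the remaining slots
 * invalid, and set CB_TARGET_MASK from compute_cb_target_mask.
 */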
618 static void compute_setup_cbs(struct r600_context *rctx)
619 {
620 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
621 unsigned i;
622
623 /* Emit colorbuffers. */
624 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
625 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
626 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
627 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
628 (struct r600_resource*)cb->base.texture,
629 RADEON_USAGE_READWRITE,
630 RADEON_PRIO_SHADER_RW_BUFFER);
631
632 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
633 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
634 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
635 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
636 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
637 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
638 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
639 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
640
641 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
642 radeon_emit(cs, reloc);
643
644 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
645 radeon_emit(cs, reloc);
646 }
647 for (; i < 8 ; i++)
648 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
649 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
650 for (; i < 12; i++)
651 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
652 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
653
654 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
655 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
656 rctx->compute_cb_target_mask);
657 }
658
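/**
 * Build the command stream for one compute dispatch: flush the DMA ring,
 * emit the start-of-CS state, color buffers, vertex/constant buffer,
 * sampler and shader state, then the dispatch itself, followed by the
 * cache invalidation flush and, on Cayman and later, the
 * CS_PARTIAL_FLUSH + DEALLOC_STATE workaround.
 */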
659 static void compute_emit_cs(struct r600_context *rctx,
660 const struct pipe_grid_info *info)
661 {
662 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
663
664 /* make sure that the gfx ring is the only one active */
665 if (radeon_emitted(rctx->b.dma.cs, 0)) {
666 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
667 }
668
669 /* Initialize all the compute-related registers.
670 *
671 * See evergreen_init_atom_start_compute_cs() in this file for the list
672 * of registers initialized by the start_compute_cs_cmd atom.
673 */
674 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
675
676 /* emit config state */
677 if (rctx->b.chip_class == EVERGREEN)
678 r600_emit_atom(rctx, &rctx->config_state.atom);
679
680 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
681 r600_flush_emit(rctx);
682
683 compute_setup_cbs(rctx);
684
685 /* Emit vertex buffer state */
686 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
687 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
688
689 /* Emit constant buffer state */
690 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
691
692 /* Emit sampler state */
693 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
694
695 /* Emit sampler view (texture resource) state */
696 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
697
698 /* Emit compute shader state */
699 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
700
701 /* Emit dispatch state and dispatch packet */
702 evergreen_emit_dispatch(rctx, info);
703
704 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
705 */
706 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
707 R600_CONTEXT_INV_VERTEX_CACHE |
708 R600_CONTEXT_INV_TEX_CACHE;
709 r600_flush_emit(rctx);
710 rctx->b.flags = 0;
711
712 if (rctx->b.chip_class >= CAYMAN) {
713 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
714 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
715 /* DEALLOC_STATE prevents the GPU from hanging when a
716 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
717 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
718 */
719 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
720 radeon_emit(cs, 0);
721 }
722
723 #if 0
724 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
725 for (i = 0; i < cs->cdw; i++) {
726 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
727 }
728 #endif
729
730 }
731
732
733 /**
734 * Emit function for r600_cs_shader_state atom
735 */
736 void evergreen_emit_cs_shader(struct r600_context *rctx,
737 struct r600_atom *atom)
738 {
739 struct r600_cs_shader_state *state =
740 (struct r600_cs_shader_state*)atom;
741 struct r600_pipe_compute *shader = state->shader;
742 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
743 uint64_t va;
744 struct r600_resource *code_bo;
745 unsigned ngpr, nstack;
746
747 code_bo = shader->code_bo;
748 va = shader->code_bo->gpu_address + state->pc;
749 ngpr = shader->bc.ngpr;
750 nstack = shader->bc.nstack;
751
752 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
753 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
754 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
755 S_0288D4_NUM_GPRS(ngpr) |
756 S_0288D4_DX10_CLAMP(1) |
757 S_0288D4_STACK_SIZE(nstack));
758 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
759
760 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
761 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
762 code_bo, RADEON_USAGE_READ,
763 RADEON_PRIO_SHADER_BINARY));
764 }
765
766 static void evergreen_launch_grid(struct pipe_context *ctx,
767 const struct pipe_grid_info *info)
768 {
769 struct r600_context *rctx = (struct r600_context *)ctx;
770 #ifdef HAVE_OPENCL
771 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
772 boolean use_kill;
773
774 rctx->cs_shader_state.pc = info->pc;
775 /* Get the config information for this kernel. */
776 r600_shader_binary_read_config(&shader->binary, &shader->bc,
777 info->pc, &use_kill);
778 #endif
779
780 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
781
782
783 evergreen_compute_upload_input(ctx, info);
784 compute_emit_cs(rctx, info);
785 }
786
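/**
 * Bind the pipe_surface resources handed in by the state tracker:
 * writable surfaces are additionally bound as RAT(i+1), and every surface
 * is made readable through a vertex buffer slot starting at VTX4 (slots
 * 0-3 are reserved for parameters and global buffers).
 */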
787 static void evergreen_set_compute_resources(struct pipe_context *ctx,
788 unsigned start, unsigned count,
789 struct pipe_surface **surfaces)
790 {
791 struct r600_context *rctx = (struct r600_context *)ctx;
792 struct r600_surface **resources = (struct r600_surface **)surfaces;
793
794 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
795 start, count);
796
797 for (unsigned i = 0; i < count; i++) {
798 /* The first four vertex buffers are reserved for parameters and
799 * global buffers. */
800 unsigned vtx_id = 4 + i;
801 if (resources[i]) {
802 struct r600_resource_global *buffer =
803 (struct r600_resource_global*)
804 resources[i]->base.texture;
805 if (resources[i]->base.writable) {
806 assert(i+1 < 12);
807
808 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
809 (struct r600_resource *)resources[i]->base.texture,
810 buffer->chunk->start_in_dw*4,
811 resources[i]->base.texture->width0);
812 }
813
814 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
815 buffer->chunk->start_in_dw * 4,
816 resources[i]->base.texture);
817 }
818 }
819 }
820
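/**
 * Bind OpenCL global buffers: the buffers are promoted into the compute
 * memory pool, each handle is rewritten to the buffer's byte offset
 * within the pool, and the pool itself is bound as RAT0 for writes and
 * vertex buffer 1 for reads (vertex buffer 2 exposes the shader code for
 * constant reads).
 */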
821 static void evergreen_set_global_binding(struct pipe_context *ctx,
822 unsigned first, unsigned n,
823 struct pipe_resource **resources,
824 uint32_t **handles)
825 {
826 struct r600_context *rctx = (struct r600_context *)ctx;
827 struct compute_memory_pool *pool = rctx->screen->global_pool;
828 struct r600_resource_global **buffers =
829 (struct r600_resource_global **)resources;
830 unsigned i;
831
832 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
833 first, n);
834
835 if (!resources) {
836 /* XXX: Unset */
837 return;
838 }
839
840 /* We mark these items for promotion to the pool if they
841 * aren't already there */
842 for (i = first; i < first + n; i++) {
843 struct compute_memory_item *item = buffers[i]->chunk;
844
845 if (!is_item_in_pool(item))
846 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
847 }
848
849 if (compute_memory_finalize_pending(pool, ctx) == -1) {
850 /* XXX: Unset */
851 return;
852 }
853
854 for (i = first; i < first + n; i++)
855 {
856 uint32_t buffer_offset;
857 uint32_t handle;
858 assert(resources[i]->target == PIPE_BUFFER);
859 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
860
861 buffer_offset = util_le32_to_cpu(*(handles[i]));
862 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
863
864 *(handles[i]) = util_cpu_to_le32(handle);
865 }
866
867 /* globals for writing */
868 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
869 /* globals for reading */
870 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
871 (struct pipe_resource*)pool->bo);
872
873 /* constants for reading, LLVM puts them in text segment */
874 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
875 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
876 }
877
878 /**
879 * This function initializes all the compute specific registers that need to
880 * be initialized for each compute command stream. Registers that are common
881 * to both compute and 3D will be initialized at the beginning of each compute
882 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
883 * packet requires that the shader type bit be set, we must initialize all
884 * context registers needed for compute in this function. The registers
885 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
886 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
887 * on the GPU family.
888 */
889 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
890 {
891 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
892 int num_threads;
893 int num_stack_entries;
894
895 /* since all required registers are initialized in the
896 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
897 */
898 r600_init_command_buffer(cb, 256);
899 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
900
901 /* This must be first. */
902 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
903 r600_store_value(cb, 0x80000000);
904 r600_store_value(cb, 0x80000000);
905
906 /* We're setting config registers here. */
907 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
908 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
909
910 switch (rctx->b.family) {
911 case CHIP_CEDAR:
912 default:
913 num_threads = 128;
914 num_stack_entries = 256;
915 break;
916 case CHIP_REDWOOD:
917 num_threads = 128;
918 num_stack_entries = 256;
919 break;
920 case CHIP_JUNIPER:
921 num_threads = 128;
922 num_stack_entries = 512;
923 break;
924 case CHIP_CYPRESS:
925 case CHIP_HEMLOCK:
926 num_threads = 128;
927 num_stack_entries = 512;
928 break;
929 case CHIP_PALM:
930 num_threads = 128;
931 num_stack_entries = 256;
932 break;
933 case CHIP_SUMO:
934 num_threads = 128;
935 num_stack_entries = 256;
936 break;
937 case CHIP_SUMO2:
938 num_threads = 128;
939 num_stack_entries = 512;
940 break;
941 case CHIP_BARTS:
942 num_threads = 128;
943 num_stack_entries = 512;
944 break;
945 case CHIP_TURKS:
946 num_threads = 128;
947 num_stack_entries = 256;
948 break;
949 case CHIP_CAICOS:
950 num_threads = 128;
951 num_stack_entries = 256;
952 break;
953 }
954
955 /* Config Registers */
956 if (rctx->b.chip_class < CAYMAN)
957 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
958 rctx->screen->b.info.drm_minor);
959 else
960 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
961 rctx->screen->b.info.drm_minor);
962
963 /* The primitive type always needs to be POINTLIST for compute. */
964 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
965 V_008958_DI_PT_POINTLIST);
966
967 if (rctx->b.chip_class < CAYMAN) {
968
969 /* These registers control which simds can be used by each stage.
970 * The default for these registers is 0xffffffff, which means
971 * all simds are available for each stage. It's possible we may
972 * want to play around with these in the future, but for now
973 * the default value is fine.
974 *
975 * R_008E20_SQ_STATIC_THREAD_MGMT1
976 * R_008E24_SQ_STATIC_THREAD_MGMT2
977 * R_008E28_SQ_STATIC_THREAD_MGMT3
978 */
979
980 /* XXX: We may need to adjust the thread and stack resource
981 * values for 3D/compute interop */
982
983 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
984
985 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
986 * Set the number of threads used by the PS/VS/GS/ES stage to
987 * 0.
988 */
989 r600_store_value(cb, 0);
990
991 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
992 * Set the number of threads used by the CS (aka LS) stage to
993 * the maximum number of threads and set the number of threads
994 * for the HS stage to 0. */
995 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
996
997 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
998 * Set the Control Flow stack entries to 0 for PS/VS stages */
999 r600_store_value(cb, 0);
1000
1001 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1002 * Set the Control Flow stack entries to 0 for GS/ES stages */
1003 r600_store_value(cb, 0);
1004
1005 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1006 * Set the Control Flow stack entries to 0 for the HS stage, and
1007 * set it to the maximum value for the CS (aka LS) stage. */
1008 r600_store_value(cb,
1009 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1010 }
1011 /* Give the compute shader all the available LDS space.
1012 * NOTE: This only sets the maximum number of dwords that a compute
1013 * shader can allocate. When a shader is executed, we still need to
1014 * allocate the appropriate amount of LDS dwords using the
1015 * CM_R_0288E8_SQ_LDS_ALLOC register.
1016 */
1017 if (rctx->b.chip_class < CAYMAN) {
1018 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1019 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1020 } else {
1021 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1022 S_0286FC_NUM_PS_LDS(0) |
1023 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1024 }
1025
1026 /* Context Registers */
1027
1028 if (rctx->b.chip_class < CAYMAN) {
1029 /* workaround for hw issues with dyn gpr - must set all limits
1030 * to 240 instead of 0, 0x1e == 240 / 8
1031 */
1032 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1033 S_028838_PS_GPRS(0x1e) |
1034 S_028838_VS_GPRS(0x1e) |
1035 S_028838_GS_GPRS(0x1e) |
1036 S_028838_ES_GPRS(0x1e) |
1037 S_028838_HS_GPRS(0x1e) |
1038 S_028838_LS_GPRS(0x1e));
1039 }
1040
1041 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1042 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1043 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1044
1045 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1046
1047 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1048 S_0286E8_TID_IN_GROUP_ENA(1) |
1049 S_0286E8_TGID_ENA(1) |
1050 S_0286E8_DISABLE_INDEX_PACK(1));
1051
1052 /* The LOOP_CONST registers are an optimization for loops that allows
1053 * you to store the initial counter, increment value, and maximum
1054 * counter value in a register so that the hardware can calculate the
1055 * correct number of iterations for the loop, so that you don't need
1056 * to have the loop counter in your shader code. We don't currently use
1057 * this optimization, so we must keep track of the counter in the
1058 * shader and use a break instruction to exit loops. However, the
1059 * hardware will still use this register to determine when to exit a
1060 * loop, so we need to initialize the counter to 0, set the increment
1061 * value to 1 and the maximum counter value to 4095 (0xfff), which
1062 * is the maximum value allowed. This gives us a maximum of 4096
1063 * iterations for our loops, but hopefully our break instruction will
1064 * execute some time before the 4096th iteration.
1065 */
1066 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1067 }
1068
1069 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1070 {
1071 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1072 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1073 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1074 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1075 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1076 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1077 rctx->b.b.launch_grid = evergreen_launch_grid;
1078
1079 }
1080
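/**
 * Map a global buffer. An item that lives in the compute memory pool is
 * first demoted to its own buffer (items not yet in the pool get a
 * detached buffer allocated on demand); the mapping is then done on that
 * real buffer rather than on the r600_resource_global wrapper.
 */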
1081 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1082 struct pipe_resource *resource,
1083 unsigned level,
1084 unsigned usage,
1085 const struct pipe_box *box,
1086 struct pipe_transfer **ptransfer)
1087 {
1088 struct r600_context *rctx = (struct r600_context*)ctx;
1089 struct compute_memory_pool *pool = rctx->screen->global_pool;
1090 struct r600_resource_global* buffer =
1091 (struct r600_resource_global*)resource;
1092
1093 struct compute_memory_item *item = buffer->chunk;
1094 struct pipe_resource *dst = NULL;
1095 unsigned offset = box->x;
1096
1097 if (is_item_in_pool(item)) {
1098 compute_memory_demote_item(pool, item, ctx);
1099 }
1100 else {
1101 if (item->real_buffer == NULL) {
1102 item->real_buffer =
1103 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1104 }
1105 }
1106
1107 dst = (struct pipe_resource*)item->real_buffer;
1108
1109 if (usage & PIPE_TRANSFER_READ)
1110 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1111
1112 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1113 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1114 "width = %u, height = %u, depth = %u)\n", level, usage,
1115 box->x, box->y, box->z, box->width, box->height,
1116 box->depth);
1117 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1118 "%u (box.x)\n", item->id, box->x);
1119
1120
1121 assert(resource->target == PIPE_BUFFER);
1122 assert(resource->bind & PIPE_BIND_GLOBAL);
1123 assert(box->x >= 0);
1124 assert(box->y == 0);
1125 assert(box->z == 0);
1126
1127 ///TODO: do it better, mapping is not possible if the pool is too big
1128 return pipe_buffer_map_range(ctx, dst,
1129 offset, box->width, usage, ptransfer);
1130 }
1131
1132 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1133 struct pipe_transfer *transfer)
1134 {
1135 /* struct r600_resource_global are not real resources, they just map
1136 * to an offset within the compute memory pool. The function
1137 * r600_compute_global_transfer_map() maps the memory pool
1138 * resource rather than the struct r600_resource_global passed to
1139 * it as an argument and then initializes ptransfer->resource with
1140 * the memory pool resource (via pipe_buffer_map_range).
1141 * When transfer_unmap is called it uses the memory pool's
1142 * vtable, which calls r600_buffer_transfer_unmap() rather than
1143 * this function.
1144 */
1145 assert (!"This function should not be called");
1146 }
1147
1148 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1149 struct pipe_transfer *transfer,
1150 const struct pipe_box *box)
1151 {
1152 assert(0 && "TODO");
1153 }
1154
1155 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1156 struct pipe_resource *res)
1157 {
1158 struct r600_resource_global* buffer = NULL;
1159 struct r600_screen* rscreen = NULL;
1160
1161 assert(res->target == PIPE_BUFFER);
1162 assert(res->bind & PIPE_BIND_GLOBAL);
1163
1164 buffer = (struct r600_resource_global*)res;
1165 rscreen = (struct r600_screen*)screen;
1166
1167 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1168
1169 buffer->chunk = NULL;
1170 free(res);
1171 }
1172
1173 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1174 {
1175 u_default_resource_get_handle, /* get_handle */
1176 r600_compute_global_buffer_destroy, /* resource_destroy */
1177 r600_compute_global_transfer_map, /* transfer_map */
1178 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1179 r600_compute_global_transfer_unmap, /* transfer_unmap */
1180 };
1181
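/**
 * Create a PIPE_BIND_GLOBAL buffer. The returned resource is a thin
 * wrapper around a compute_memory_pool item; the backing memory is
 * managed by the pool and sized in dwords.
 */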
1182 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1183 const struct pipe_resource *templ)
1184 {
1185 struct r600_resource_global* result = NULL;
1186 struct r600_screen* rscreen = NULL;
1187 int size_in_dw = 0;
1188
1189 assert(templ->target == PIPE_BUFFER);
1190 assert(templ->bind & PIPE_BIND_GLOBAL);
1191 assert(templ->array_size == 1 || templ->array_size == 0);
1192 assert(templ->depth0 == 1 || templ->depth0 == 0);
1193 assert(templ->height0 == 1 || templ->height0 == 0);
1194
1195 result = (struct r600_resource_global*)
1196 CALLOC(sizeof(struct r600_resource_global), 1);
1197 rscreen = (struct r600_screen*)screen;
1198
1199 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1200 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1201 templ->array_size);
1202
1203 result->base.b.vtbl = &r600_global_buffer_vtbl;
1204 result->base.b.b = *templ;
1205 result->base.b.b.screen = screen;
1206 pipe_reference_init(&result->base.b.b.reference, 1);
1207
1208 size_in_dw = (templ->width0+3) / 4;
1209
1210 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1211
1212 if (result->chunk == NULL)
1213 {
1214 free(result);
1215 return NULL;
1216 }
1217
1218 return &result->base.b.b;
1219 }