[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #include "radeon/radeon_elf_util.h"
50 #include <inttypes.h>
51
52 /**
53 RAT0 is for global binding write
54 VTX1 is for global binding read
55
56 for writing images RAT1...
57 for reading images TEX2...
58 TEX2-RAT1 is paired
59
60 TEX2... consumes the same fetch resources that VTX2... would consume
61
62 CONST0 and VTX0 are for parameters
63 CONST0 binds the smaller input parameter buffer and is used for constant
64 indexing; it is also constant cached
65 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
66 the constant cache can handle
67
68 RATs are limited to 12, so we can bind at most 11 textures for writing,
69 because we reserve RAT0 for global bindings. With byte addressing enabled
70 we should reserve another one too => at most 10 image bindings for writing.
71
72 from Nvidia OpenCL:
73 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
74 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
75
76 so 10 for writing is enough. 176 is the max for reading according to the docs
77
78 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
79 writable images also consume TEX slots, and VTX slots too because of linear indexing
80
81 */
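/* A rough summary of the binding layout described above and used by the code
 * below (a sketch pieced together from this file, not an authoritative
 * hardware description):
 *   RAT0    - global buffer writes   (evergreen_set_global_binding)
 *   RAT1..  - writable compute resources, resource i -> RAT(i+1)
 *   CONST0  - kernel parameters      (evergreen_compute_upload_input)
 *   VTX1    - global buffer reads    (the compute memory pool)
 *   VTX2    - shader code BO         (constants live in the text segment)
 *   VTX3    - kernel parameters, for dynamic indexing
 *   VTX4..  - compute resources      (evergreen_set_compute_resources)
 */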
82
83 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
84 unsigned size)
85 {
86 struct pipe_resource *buffer = NULL;
87 assert(size);
88
89 buffer = pipe_buffer_create((struct pipe_screen*) screen,
90 PIPE_BIND_CUSTOM,
91 PIPE_USAGE_IMMUTABLE,
92 size);
93
94 return (struct r600_resource *)buffer;
95 }
96
97
98 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
99 unsigned id,
100 struct r600_resource *bo,
101 int start,
102 int size)
103 {
104 struct pipe_surface rat_templ;
105 struct r600_surface *surf = NULL;
106 struct r600_context *rctx = NULL;
107
108 assert(id < 12);
109 assert((size & 3) == 0);
110 assert((start & 0xFF) == 0);
111
112 rctx = pipe->ctx;
113
114 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
115
116 /* Create the RAT surface */
117 memset(&rat_templ, 0, sizeof(rat_templ));
118 rat_templ.format = PIPE_FORMAT_R32_UINT;
119 rat_templ.u.tex.level = 0;
120 rat_templ.u.tex.first_layer = 0;
121 rat_templ.u.tex.last_layer = 0;
122
123 /* Add the RAT to the list of color buffers */
124 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
125 (struct pipe_context *)pipe->ctx,
126 (struct pipe_resource *)bo, &rat_templ);
127
128 /* Update the number of color buffers */
129 pipe->ctx->framebuffer.state.nr_cbufs =
130 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
131
132 /* Update the cb_target_mask
133 * XXX: I think this is a potential spot for bugs once we start doing
134 * GL interop. cb_target_mask may be modified in the 3D sections
135 * of this driver. */
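	/* Each colorbuffer gets four enable bits (one per channel) in the mask,
	 * so 0xf << (id * 4) enables all channels of colorbuffer <id>. */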
136 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
137
138 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
139 evergreen_init_color_surface_rat(rctx, surf);
140 }
141
142 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
143 unsigned vb_index,
144 unsigned offset,
145 struct pipe_resource *buffer)
146 {
147 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
148 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
149 vb->stride = 1;
150 vb->buffer_offset = offset;
151 vb->buffer = buffer;
152 vb->user_buffer = NULL;
153
154 /* The vertex instructions in the compute shaders use the texture cache,
155 * so we need to invalidate it. */
156 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
157 state->enabled_mask |= 1 << vb_index;
158 state->dirty_mask |= 1 << vb_index;
159 r600_mark_atom_dirty(rctx, &state->atom);
160 }
161
162 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
163 unsigned cb_index,
164 unsigned offset,
165 unsigned size,
166 struct pipe_resource *buffer)
167 {
168 struct pipe_constant_buffer cb;
169 cb.buffer_size = size;
170 cb.buffer_offset = offset;
171 cb.buffer = buffer;
172 cb.user_buffer = NULL;
173
174 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
175 }
176
177 /* We need to define these R600 registers here, because we can't include
178 * both evergreend.h and r600d.h at the same time.
179 */
180 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
181 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
182
183 #ifdef HAVE_OPENCL
184
185 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
186 struct r600_bytecode *bc,
187 uint64_t symbol_offset,
188 boolean *use_kill)
189 {
190 unsigned i;
191 const unsigned char *config =
192 radeon_shader_binary_config_start(binary, symbol_offset);
193
194 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
195 unsigned reg =
196 util_le32_to_cpu(*(uint32_t*)(config + i));
197 unsigned value =
198 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
199 switch (reg) {
200 /* R600 / R700 */
201 case R_028850_SQ_PGM_RESOURCES_PS:
202 case R_028868_SQ_PGM_RESOURCES_VS:
203 /* Evergreen / Northern Islands */
204 case R_028844_SQ_PGM_RESOURCES_PS:
205 case R_028860_SQ_PGM_RESOURCES_VS:
206 case R_0288D4_SQ_PGM_RESOURCES_LS:
207 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
208 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
209 break;
210 case R_02880C_DB_SHADER_CONTROL:
211 *use_kill = G_02880C_KILL_ENABLE(value);
212 break;
213 case R_0288E8_SQ_LDS_ALLOC:
214 bc->nlds_dw = value;
215 break;
216 }
217 }
218 }
219
220 static unsigned r600_create_shader(struct r600_bytecode *bc,
221 const struct radeon_shader_binary *binary,
222 boolean *use_kill)
223
224 {
225 assert(binary->code_size % 4 == 0);
226 bc->bytecode = CALLOC(1, binary->code_size);
227 memcpy(bc->bytecode, binary->code, binary->code_size);
228 bc->ndw = binary->code_size / 4;
229
230 r600_shader_binary_read_config(binary, bc, 0, use_kill);
231 return 0;
232 }
233
234 #endif
235
236 static void r600_destroy_shader(struct r600_bytecode *bc)
237 {
238 FREE(bc->bytecode);
239 }
240
241 static void *evergreen_create_compute_state(struct pipe_context *ctx,
242 const struct pipe_compute_state *cso)
243 {
244 struct r600_context *rctx = (struct r600_context *)ctx;
245 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
246 #ifdef HAVE_OPENCL
247 const struct pipe_llvm_program_header *header;
248 const char *code;
249 void *p;
250 boolean use_kill;
251
252 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
253 header = cso->prog;
254 code = cso->prog + sizeof(struct pipe_llvm_program_header);
255 radeon_shader_binary_init(&shader->binary);
256 radeon_elf_read(code, header->num_bytes, &shader->binary);
257 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
258
259 /* Upload code + ROdata */
260 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
261 shader->bc.ndw * 4);
262 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
263 //TODO: use util_memcpy_cpu_to_le32 ?
264 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
265 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
266 #endif
267
268 shader->ctx = rctx;
269 shader->local_size = cso->req_local_mem;
270 shader->private_size = cso->req_private_mem;
271 shader->input_size = cso->req_input_mem;
272
273 return shader;
274 }
275
276 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
277 {
278 struct r600_context *rctx = (struct r600_context *)ctx;
279 struct r600_pipe_compute *shader = state;
280
281 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
282
283 if (!shader)
284 return;
285
286 radeon_shader_binary_clean(&shader->binary);
287 r600_destroy_shader(&shader->bc);
288
289 /* TODO destroy shader->code_bo, shader->const_bo
290 * we'll need something like r600_buffer_free */
291 FREE(shader);
292 }
293
294 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
295 {
296 struct r600_context *rctx = (struct r600_context *)ctx;
297
298 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
299
300 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
301 }
302
303 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
304 * explicit kernel parameters, there are implicit parameters that need to be
305 * stored in the vertex buffer as well. Here is how these parameters are
306 * organized in the buffer:
307 *
308 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
309 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
310 * DWORDS 6-8: Number of work items within each work group in each dimension
311 * (x,y,z)
312 * DWORDS 9+ : Kernel parameters
313 */
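/* For example (illustrative numbers only): a grid of 4x2x1 work groups of
 * 64x1x1 work items each would give DWORDS 0-2 = {4, 2, 1},
 * DWORDS 3-5 = {256, 2, 1} (grid * block per dimension),
 * DWORDS 6-8 = {64, 1, 1}, and the kernel's own parameters would start at
 * byte offset 36. */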
314 static void evergreen_compute_upload_input(struct pipe_context *ctx,
315 const struct pipe_grid_info *info)
316 {
317 struct r600_context *rctx = (struct r600_context *)ctx;
318 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
319 unsigned i;
320 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
321 * parameters.
322 */
323 unsigned input_size = shader->input_size + 36;
324 uint32_t *num_work_groups_start;
325 uint32_t *global_size_start;
326 uint32_t *local_size_start;
327 uint32_t *kernel_parameters_start;
328 struct pipe_box box;
329 struct pipe_transfer *transfer = NULL;
330
331 if (shader->input_size == 0) {
332 return;
333 }
334
335 if (!shader->kernel_param) {
336 /* Add space for the grid dimensions */
337 shader->kernel_param = (struct r600_resource *)
338 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
339 PIPE_USAGE_IMMUTABLE, input_size);
340 }
341
342 u_box_1d(0, input_size, &box);
343 num_work_groups_start = ctx->transfer_map(ctx,
344 (struct pipe_resource*)shader->kernel_param,
345 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
346 &box, &transfer);
347 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
348 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
349 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
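	/* Each implicit section is 3 uints; with 4-byte uints the expressions
	 * above simply advance the uint32_t pointers by 3 entries (12 bytes). */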
350
351 /* Copy the work group size */
352 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
353
354 /* Copy the global size */
355 for (i = 0; i < 3; i++) {
356 global_size_start[i] = info->grid[i] * info->block[i];
357 }
358
359 /* Copy the local dimensions */
360 memcpy(local_size_start, info->block, 3 * sizeof(uint));
361
362 /* Copy the kernel inputs */
363 memcpy(kernel_parameters_start, info->input, shader->input_size);
364
365 for (i = 0; i < (input_size / 4); i++) {
366 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
367 ((unsigned*)num_work_groups_start)[i]);
368 }
369
370 ctx->transfer_unmap(ctx, transfer);
371
372 /* ID=0 and ID=3 are reserved for the parameters.
373 * LLVM will preferably use ID=0, but it does not work for dynamic
374 * indices. */
375 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
376 (struct pipe_resource*)shader->kernel_param);
377 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
378 (struct pipe_resource*)shader->kernel_param);
379 }
380
381 static void evergreen_emit_dispatch(struct r600_context *rctx,
382 const struct pipe_grid_info *info)
383 {
384 int i;
385 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
386 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
387 unsigned num_waves;
388 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
389 unsigned wave_divisor = (16 * num_pipes);
390 int group_size = 1;
391 int grid_size = 1;
392 unsigned lds_size = shader->local_size / 4 +
393 shader->bc.nlds_dw;
394
395
396 /* Calculate group_size/grid_size */
397 for (i = 0; i < 3; i++) {
398 group_size *= info->block[i];
399 }
400
401 for (i = 0; i < 3; i++) {
402 grid_size *= info->grid[i];
403 }
404
405 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
406 num_waves = (info->block[0] * info->block[1] * info->block[2] +
407 wave_divisor - 1) / wave_divisor;
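	/* For example (illustrative numbers only): a 64x1x1 thread block on a
	 * part reporting 2 quad pipes gives wave_divisor = 32 and
	 * num_waves = ceil(64 / 32) = 2. */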
408
409 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
410 "%u wavefronts per thread block, "
411 "allocating %u dwords lds.\n",
412 num_pipes, num_waves, lds_size);
413
414 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
415
416 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
417 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
418 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
419 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
420
421 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
422 group_size);
423
424 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
425 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
426 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
427 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
428
429 if (rctx->b.chip_class < CAYMAN) {
430 assert(lds_size <= 8192);
431 } else {
432 /* Cayman appears to have a slightly smaller limit, see the
433 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
434 assert(lds_size <= 8160);
435 }
436
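	/* SQ_LDS_ALLOC packs the LDS size in dwords into the low bits and the
	 * wave count at bit 14, as emitted below; e.g. lds_size = 256 and
	 * num_waves = 2 would give 0x8100 (illustrative numbers only). */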
437 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
438 lds_size | (num_waves << 14));
439
440 /* Dispatch packet */
441 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
442 radeon_emit(cs, info->grid[0]);
443 radeon_emit(cs, info->grid[1]);
444 radeon_emit(cs, info->grid[2]);
445 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
446 radeon_emit(cs, 1);
447 }
448
449 static void compute_emit_cs(struct r600_context *rctx,
450 const struct pipe_grid_info *info)
451 {
452 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
453 unsigned i;
454
455 /* make sure that the gfx ring is the only one active */
456 if (radeon_emitted(rctx->b.dma.cs, 0)) {
457 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
458 }
459
460 /* Initialize all the compute-related registers.
461 *
462 * See evergreen_init_atom_start_compute_cs() in this file for the list
463 * of registers initialized by the start_compute_cs_cmd atom.
464 */
465 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
466
467 /* emit config state */
468 if (rctx->b.chip_class == EVERGREEN)
469 r600_emit_atom(rctx, &rctx->config_state.atom);
470
471 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
472 r600_flush_emit(rctx);
473
474 /* Emit colorbuffers. */
475 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
476 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
477 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
478 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
479 (struct r600_resource*)cb->base.texture,
480 RADEON_USAGE_READWRITE,
481 RADEON_PRIO_SHADER_RW_BUFFER);
482
483 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
484 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
485 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
486 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
487 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
488 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
489 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
490 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
491
492 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
493 radeon_emit(cs, reloc);
494
495 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
496 radeon_emit(cs, reloc);
497 }
498 for (; i < 8 ; i++)
499 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
500 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
501 for (; i < 12; i++)
502 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
503 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
504
505 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
506 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
507 rctx->compute_cb_target_mask);
508
509
510 /* Emit vertex buffer state */
511 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
512 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
513
514 /* Emit constant buffer state */
515 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
516
517 /* Emit sampler state */
518 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
519
520 /* Emit sampler view (texture resource) state */
521 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
522
523 /* Emit compute shader state */
524 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
525
526 /* Emit dispatch state and dispatch packet */
527 evergreen_emit_dispatch(rctx, info);
528
529 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
530 */
531 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
532 R600_CONTEXT_INV_VERTEX_CACHE |
533 R600_CONTEXT_INV_TEX_CACHE;
534 r600_flush_emit(rctx);
535 rctx->b.flags = 0;
536
537 if (rctx->b.chip_class >= CAYMAN) {
538 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
539 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
540 /* DEALLOC_STATE prevents the GPU from hanging when a
541 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
542 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
543 */
544 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
545 radeon_emit(cs, 0);
546 }
547
548 #if 0
549 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
550 for (i = 0; i < cs->cdw; i++) {
551 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
552 }
553 #endif
554
555 }
556
557
558 /**
559 * Emit function for r600_cs_shader_state atom
560 */
561 void evergreen_emit_cs_shader(struct r600_context *rctx,
562 struct r600_atom *atom)
563 {
564 struct r600_cs_shader_state *state =
565 (struct r600_cs_shader_state*)atom;
566 struct r600_pipe_compute *shader = state->shader;
567 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
568 uint64_t va;
569 struct r600_resource *code_bo;
570 unsigned ngpr, nstack;
571
572 code_bo = shader->code_bo;
573 va = shader->code_bo->gpu_address + state->pc;
574 ngpr = shader->bc.ngpr;
575 nstack = shader->bc.nstack;
576
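	/* The SQ_PGM_START_LS register takes the shader address shifted right
	 * by 8, i.e. in 256-byte units, as the emit below shows. */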
577 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
578 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
579 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
580 S_0288D4_NUM_GPRS(ngpr)
581 | S_0288D4_STACK_SIZE(nstack));
582 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
583
584 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
585 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
586 code_bo, RADEON_USAGE_READ,
587 RADEON_PRIO_USER_SHADER));
588 }
589
590 static void evergreen_launch_grid(struct pipe_context *ctx,
591 const struct pipe_grid_info *info)
592 {
593 struct r600_context *rctx = (struct r600_context *)ctx;
594 #ifdef HAVE_OPENCL
595 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
596 boolean use_kill;
597
598 rctx->cs_shader_state.pc = info->pc;
599 /* Get the config information for this kernel. */
600 r600_shader_binary_read_config(&shader->binary, &shader->bc,
601 info->pc, &use_kill);
602 #endif
603
604 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
605
606
607 evergreen_compute_upload_input(ctx, info);
608 compute_emit_cs(rctx, info);
609 }
610
611 static void evergreen_set_compute_resources(struct pipe_context *ctx,
612 unsigned start, unsigned count,
613 struct pipe_surface **surfaces)
614 {
615 struct r600_context *rctx = (struct r600_context *)ctx;
616 struct r600_surface **resources = (struct r600_surface **)surfaces;
617
618 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
619 start, count);
620
621 for (unsigned i = 0; i < count; i++) {
622 /* The first four vertex buffers are reserved for parameters and
623 * global buffers. */
624 unsigned vtx_id = 4 + i;
625 if (resources[i]) {
626 struct r600_resource_global *buffer =
627 (struct r600_resource_global*)
628 resources[i]->base.texture;
629 if (resources[i]->base.writable) {
630 assert(i+1 < 12);
631
632 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
633 (struct r600_resource *)resources[i]->base.texture,
634 buffer->chunk->start_in_dw*4,
635 resources[i]->base.texture->width0);
636 }
637
638 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
639 buffer->chunk->start_in_dw * 4,
640 resources[i]->base.texture);
641 }
642 }
643 }
644
645 static void evergreen_set_global_binding(struct pipe_context *ctx,
646 unsigned first, unsigned n,
647 struct pipe_resource **resources,
648 uint32_t **handles)
649 {
650 struct r600_context *rctx = (struct r600_context *)ctx;
651 struct compute_memory_pool *pool = rctx->screen->global_pool;
652 struct r600_resource_global **buffers =
653 (struct r600_resource_global **)resources;
654 unsigned i;
655
656 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
657 first, n);
658
659 if (!resources) {
660 /* XXX: Unset */
661 return;
662 }
663
664 /* We mark these items for promotion to the pool if they
665 * aren't already there */
666 for (i = first; i < first + n; i++) {
667 struct compute_memory_item *item = buffers[i]->chunk;
668
669 if (!is_item_in_pool(item))
670 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
671 }
672
673 if (compute_memory_finalize_pending(pool, ctx) == -1) {
674 /* XXX: Unset */
675 return;
676 }
677
678 for (i = first; i < first + n; i++)
679 {
680 uint32_t buffer_offset;
681 uint32_t handle;
682 assert(resources[i]->target == PIPE_BUFFER);
683 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
684
685 buffer_offset = util_le32_to_cpu(*(handles[i]));
686 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
687
688 *(handles[i]) = util_cpu_to_le32(handle);
689 }
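	/* Example (illustrative numbers only): if a buffer's chunk starts at
	 * dword 16 of the pool and the caller passed a handle of 8, the patched
	 * handle becomes 8 + 16 * 4 = 72, effectively a byte offset into the
	 * pool buffer bound below. */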
690
691 /* globals for writing */
692 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
693 /* globals for reading */
694 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
695 (struct pipe_resource*)pool->bo);
696
697 /* constants for reading, LLVM puts them in text segment */
698 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
699 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
700 }
701
702 /**
703 * This function initializes all the compute specific registers that need to
704 * be initialized for each compute command stream. Registers that are common
705 * to both compute and 3D will be initialized at the beginning of each compute
706 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
707 * packet requires that the shader type bit be set, we must initialize all
708 * context registers needed for compute in this function. The registers
709 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
710 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
711 * on the GPU family.
712 */
713 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
714 {
715 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
716 int num_threads;
717 int num_stack_entries;
718
719 /* since all required registers are initialized in the
720 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
721 */
722 r600_init_command_buffer(cb, 256);
723 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
724
725 /* This must be first. */
726 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
727 r600_store_value(cb, 0x80000000);
728 r600_store_value(cb, 0x80000000);
729
730 /* We're setting config registers here. */
731 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
732 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
733
734 switch (rctx->b.family) {
735 case CHIP_CEDAR:
736 default:
737 num_threads = 128;
738 num_stack_entries = 256;
739 break;
740 case CHIP_REDWOOD:
741 num_threads = 128;
742 num_stack_entries = 256;
743 break;
744 case CHIP_JUNIPER:
745 num_threads = 128;
746 num_stack_entries = 512;
747 break;
748 case CHIP_CYPRESS:
749 case CHIP_HEMLOCK:
750 num_threads = 128;
751 num_stack_entries = 512;
752 break;
753 case CHIP_PALM:
754 num_threads = 128;
755 num_stack_entries = 256;
756 break;
757 case CHIP_SUMO:
758 num_threads = 128;
759 num_stack_entries = 256;
760 break;
761 case CHIP_SUMO2:
762 num_threads = 128;
763 num_stack_entries = 512;
764 break;
765 case CHIP_BARTS:
766 num_threads = 128;
767 num_stack_entries = 512;
768 break;
769 case CHIP_TURKS:
770 num_threads = 128;
771 num_stack_entries = 256;
772 break;
773 case CHIP_CAICOS:
774 num_threads = 128;
775 num_stack_entries = 256;
776 break;
777 }
778
779 /* Config Registers */
780 if (rctx->b.chip_class < CAYMAN)
781 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
782 rctx->screen->b.info.drm_minor);
783 else
784 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
785 rctx->screen->b.info.drm_minor);
786
787 /* The primitive type always needs to be POINTLIST for compute. */
788 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
789 V_008958_DI_PT_POINTLIST);
790
791 if (rctx->b.chip_class < CAYMAN) {
792
793 /* These registers control which simds can be used by each stage.
794 * The default for these registers is 0xffffffff, which means
795 * all simds are available for each stage. It's possible we may
796 * want to play around with these in the future, but for now
797 * the default value is fine.
798 *
799 * R_008E20_SQ_STATIC_THREAD_MGMT1
800 * R_008E24_SQ_STATIC_THREAD_MGMT2
801 * R_008E28_SQ_STATIC_THREAD_MGMT3
802 */
803
804 /* XXX: We may need to adjust the thread and stack resource
805 * values for 3D/compute interop */
806
807 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
808
809 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
810 * Set the number of threads used by the PS/VS/GS/ES stage to
811 * 0.
812 */
813 r600_store_value(cb, 0);
814
815 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
816 * Set the number of threads used by the CS (aka LS) stage to
817 * the maximum number of threads and set the number of threads
818 * for the HS stage to 0. */
819 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
820
821 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
822 * Set the Control Flow stack entries to 0 for PS/VS stages */
823 r600_store_value(cb, 0);
824
825 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
826 * Set the Control Flow stack entries to 0 for GS/ES stages */
827 r600_store_value(cb, 0);
828
829 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
830 * Set the Control Flow stack entries to 0 for the HS stage, and
831 * set it to the maximum value for the CS (aka LS) stage. */
832 r600_store_value(cb,
833 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
834 }
835 /* Give the compute shader all the available LDS space.
836 * NOTE: This only sets the maximum number of dwords that a compute
837 * shader can allocate. When a shader is executed, we still need to
838 * allocate the appropriate amount of LDS dwords using the
839 * CM_R_0288E8_SQ_LDS_ALLOC register.
840 */
841 if (rctx->b.chip_class < CAYMAN) {
842 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
843 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
844 } else {
845 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
846 S_0286FC_NUM_PS_LDS(0) |
847 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
848 }
849
850 /* Context Registers */
851
852 if (rctx->b.chip_class < CAYMAN) {
853 /* workaround for hw issues with dyn gpr - must set all limits
854 * to 240 instead of 0, 0x1e == 240 / 8
855 */
856 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
857 S_028838_PS_GPRS(0x1e) |
858 S_028838_VS_GPRS(0x1e) |
859 S_028838_GS_GPRS(0x1e) |
860 S_028838_ES_GPRS(0x1e) |
861 S_028838_HS_GPRS(0x1e) |
862 S_028838_LS_GPRS(0x1e));
863 }
864
865 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
866 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
867 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
868
869 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
870
871 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
872 S_0286E8_TID_IN_GROUP_ENA
873 | S_0286E8_TGID_ENA
874 | S_0286E8_DISABLE_INDEX_PACK)
875 ;
876
877 /* The LOOP_CONST registers are an optimization for loops that allows
878 * you to store the initial counter, increment value, and maximum
879 * counter value in a register so that hardware can calculate the
880 * correct number of iterations for the loop, so that you don't need
881 * to have the loop counter in your shader code. We don't currently use
882 * this optimization, so we must keep track of the counter in the
883 * shader and use a break instruction to exit loops. However, the
884 * hardware will still use this register to determine when to exit a
885 * loop, so we need to initialize the counter to 0, set the increment
886 * value to 1, and the maximum counter value to 4095 (0xfff), which
887 * is the maximum value allowed. This gives us a maximum of 4096
888 * iterations for our loops, but hopefully our break instruction will
889 * execute some time before the 4096th iteration.
890 */
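	/* Reading 0x1000FFF against the description above: increment = 1 in the
	 * top byte, initial counter = 0 in the middle bits, and a maximum count
	 * of 0xFFF = 4095 in the low 12 bits (an interpretation of the value,
	 * not a verified register layout). */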
891 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
892 }
893
894 void evergreen_init_compute_state_functions(struct r600_context *rctx)
895 {
896 rctx->b.b.create_compute_state = evergreen_create_compute_state;
897 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
898 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
899 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
900 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
901 rctx->b.b.set_global_binding = evergreen_set_global_binding;
902 rctx->b.b.launch_grid = evergreen_launch_grid;
903
904 }
905
906 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
907 struct pipe_resource *resource,
908 unsigned level,
909 unsigned usage,
910 const struct pipe_box *box,
911 struct pipe_transfer **ptransfer)
912 {
913 struct r600_context *rctx = (struct r600_context*)ctx;
914 struct compute_memory_pool *pool = rctx->screen->global_pool;
915 struct r600_resource_global* buffer =
916 (struct r600_resource_global*)resource;
917
918 struct compute_memory_item *item = buffer->chunk;
919 struct pipe_resource *dst = NULL;
920 unsigned offset = box->x;
921
922 if (is_item_in_pool(item)) {
923 compute_memory_demote_item(pool, item, ctx);
924 }
925 else {
926 if (item->real_buffer == NULL) {
927 item->real_buffer =
928 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
929 }
930 }
931
932 dst = (struct pipe_resource*)item->real_buffer;
933
934 if (usage & PIPE_TRANSFER_READ)
935 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
936
937 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
938 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
939 "width = %u, height = %u, depth = %u)\n", level, usage,
940 box->x, box->y, box->z, box->width, box->height,
941 box->depth);
942 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
943 "%u (box.x)\n", item->id, box->x);
944
945
946 assert(resource->target == PIPE_BUFFER);
947 assert(resource->bind & PIPE_BIND_GLOBAL);
948 assert(box->x >= 0);
949 assert(box->y == 0);
950 assert(box->z == 0);
951
952 ///TODO: do it better, mapping is not possible if the pool is too big
953 return pipe_buffer_map_range(ctx, dst,
954 offset, box->width, usage, ptransfer);
955 }
956
957 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
958 struct pipe_transfer *transfer)
959 {
960 /* struct r600_resource_global are not real resources, they just map
961 * to an offset within the compute memory pool. The function
962 * r600_compute_global_transfer_map() maps the memory pool
963 * resource rather than the struct r600_resource_global passed to
964 * it as an argument and then initializes ptransfer->resource with
965 * the memory pool resource (via pipe_buffer_map_range).
966 * When transfer_unmap is called it uses the memory pool's
967 * vtable which calls r600_buffer_transfer_unmap() rather than
968 * this function.
969 */
970 assert (!"This function should not be called");
971 }
972
973 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
974 struct pipe_transfer *transfer,
975 const struct pipe_box *box)
976 {
977 assert(0 && "TODO");
978 }
979
980 static void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
981 struct pipe_resource *resource,
982 unsigned level,
983 unsigned usage,
984 const struct pipe_box *box,
985 const void *data,
986 unsigned stride,
987 unsigned layer_stride)
988 {
989 assert(0 && "TODO");
990 }
991
992 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
993 struct pipe_resource *res)
994 {
995 struct r600_resource_global* buffer = NULL;
996 struct r600_screen* rscreen = NULL;
997
998 assert(res->target == PIPE_BUFFER);
999 assert(res->bind & PIPE_BIND_GLOBAL);
1000
1001 buffer = (struct r600_resource_global*)res;
1002 rscreen = (struct r600_screen*)screen;
1003
1004 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1005
1006 buffer->chunk = NULL;
1007 free(res);
1008 }
1009
1010 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1011 {
1012 u_default_resource_get_handle, /* get_handle */
1013 r600_compute_global_buffer_destroy, /* resource_destroy */
1014 r600_compute_global_transfer_map, /* transfer_map */
1015 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1016 r600_compute_global_transfer_unmap, /* transfer_unmap */
1017 r600_compute_global_transfer_inline_write /* transfer_inline_write */
1018 };
1019
1020 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1021 const struct pipe_resource *templ)
1022 {
1023 struct r600_resource_global* result = NULL;
1024 struct r600_screen* rscreen = NULL;
1025 int size_in_dw = 0;
1026
1027 assert(templ->target == PIPE_BUFFER);
1028 assert(templ->bind & PIPE_BIND_GLOBAL);
1029 assert(templ->array_size == 1 || templ->array_size == 0);
1030 assert(templ->depth0 == 1 || templ->depth0 == 0);
1031 assert(templ->height0 == 1 || templ->height0 == 0);
1032
1033 result = (struct r600_resource_global*)
1034 CALLOC(sizeof(struct r600_resource_global), 1);
1035 rscreen = (struct r600_screen*)screen;
1036
1037 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1038 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1039 templ->array_size);
1040
1041 result->base.b.vtbl = &r600_global_buffer_vtbl;
1042 result->base.b.b = *templ;
1043 result->base.b.b.screen = screen;
1044 pipe_reference_init(&result->base.b.b.reference, 1);
1045
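	/* Round the buffer size in bytes up to whole dwords,
	 * e.g. 10 bytes -> 3 dwords. */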
1046 size_in_dw = (templ->width0+3) / 4;
1047
1048 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1049
1050 if (result->chunk == NULL)
1051 {
1052 free(result);
1053 return NULL;
1054 }
1055
1056 return &result->base.b.b;
1057 }