radeon/compute: Stop leaking LLVMContexts in radeon_llvm_parse_bitcode
[mesa.git] src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/u_double_list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_resource.h"
43 #include "r600_shader.h"
44 #include "r600_pipe.h"
45 #include "r600_formats.h"
46 #include "evergreen_compute.h"
47 #include "evergreen_compute_internal.h"
48 #include "compute_memory_pool.h"
49 #include "sb/sb_public.h"
50 #ifdef HAVE_OPENCL
51 #include "radeon_llvm_util.h"
52 #endif
53
54 /**
55 RAT0 is for global binding write
56 VTX1 is for global binding read
57
58 for writing images RAT1...
59 for reading images TEX2...
60 TEX2-RAT1 is paired
61
62 TEX2... consumes the same fetch resources that VTX2... would consume
63
64 CONST0 and VTX0 are for parameters
65 CONST0 binds the smaller input parameter buffer and is used for constant indexing;
66 it is also constant cached
67 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
68 the constant cache can handle
69
70 RATs are limited to 12, so we can only bind at most 11 textures for writing
71 because we reserve RAT0 for global bindings. With byte addressing enabled,
72 we should reserve another one too => at most 10 image bindings for writing.
73
74 from Nvidia OpenCL:
75 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
76 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
77
78 so 10 for writing is enough. 176 is the max for reading according to the docs
79
80 writable images should be listed first (ids < 10), so their id corresponds to RAT(id+1)
81 writable images will consume TEX slots, and VTX slots too, because of linear indexing
82
83 */
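/* Illustrative sketch of the binding scheme described above (assuming two
 * writable images and three read-only images are bound; the authoritative
 * mapping is whatever evergreen_set_rat() and evergreen_cs_set_vertex_buffer()
 * actually program):
 *
 *   RAT0           global buffer writes
 *   CONST0 / VTX0  kernel parameters
 *   VTX1           global buffer reads
 *   RAT1 + TEX2    writable image 0
 *   RAT2 + TEX3    writable image 1
 *   TEX4..TEX6     read-only images 0..2
 */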
84
85 struct r600_resource* r600_compute_buffer_alloc_vram(
86 struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource * buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create(
93 (struct pipe_screen*) screen,
94 PIPE_BIND_CUSTOM,
95 PIPE_USAGE_IMMUTABLE,
96 size);
97
98 return (struct r600_resource *)buffer;
99 }
100
101
102 static void evergreen_set_rat(
103 struct r600_pipe_compute *pipe,
104 int id,
105 struct r600_resource* bo,
106 int start,
107 int size)
108 {
109 struct pipe_surface rat_templ;
110 struct r600_surface *surf = NULL;
111 struct r600_context *rctx = NULL;
112
113 assert(id < 12);
114 assert((size & 3) == 0);
115 assert((start & 0xFF) == 0);
116
117 rctx = pipe->ctx;
118
119 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
120
121 /* Create the RAT surface */
122 memset(&rat_templ, 0, sizeof(rat_templ));
123 rat_templ.format = PIPE_FORMAT_R32_UINT;
124 rat_templ.u.tex.level = 0;
125 rat_templ.u.tex.first_layer = 0;
126 rat_templ.u.tex.last_layer = 0;
127
128 /* Add the RAT to the list of color buffers */
129 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
130 (struct pipe_context *)pipe->ctx,
131 (struct pipe_resource *)bo, &rat_templ);
132
133 /* Update the number of color buffers */
134 pipe->ctx->framebuffer.state.nr_cbufs =
135 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
136
137 /* Update the cb_target_mask
138 * XXX: I think this is a potential spot for bugs once we start doing
139 * GL interop. cb_target_mask may be modified in the 3D sections
140 * of this driver. */
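/* CB_TARGET_MASK holds a 4-bit channel write mask per color buffer, so
 * enable all four channels of the color buffer that backs this RAT. */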
141 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
142
143 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
144 evergreen_init_color_surface_rat(rctx, surf);
145 }
146
147 static void evergreen_cs_set_vertex_buffer(
148 struct r600_context * rctx,
149 unsigned vb_index,
150 unsigned offset,
151 struct pipe_resource * buffer)
152 {
153 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
154 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
155 vb->stride = 1;
156 vb->buffer_offset = offset;
157 vb->buffer = buffer;
158 vb->user_buffer = NULL;
159
160 /* The vertex instructions in the compute shaders use the texture cache,
161 * so we need to invalidate it. */
162 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
163 state->enabled_mask |= 1 << vb_index;
164 state->dirty_mask |= 1 << vb_index;
165 state->atom.dirty = true;
166 }
167
168 static void evergreen_cs_set_constant_buffer(
169 struct r600_context * rctx,
170 unsigned cb_index,
171 unsigned offset,
172 unsigned size,
173 struct pipe_resource * buffer)
174 {
175 struct pipe_constant_buffer cb;
176 cb.buffer_size = size;
177 cb.buffer_offset = offset;
178 cb.buffer = buffer;
179 cb.user_buffer = NULL;
180
181 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
182 }
183
184 static const struct u_resource_vtbl r600_global_buffer_vtbl =
185 {
186 u_default_resource_get_handle, /* get_handle */
187 r600_compute_global_buffer_destroy, /* resource_destroy */
188 r600_compute_global_transfer_map, /* transfer_map */
189 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
190 r600_compute_global_transfer_unmap, /* transfer_unmap */
191 r600_compute_global_transfer_inline_write /* transfer_inline_write */
192 };
193
194
195 void *evergreen_create_compute_state(
196 struct pipe_context *ctx_,
197 const struct pipe_compute_state *cso)
198 {
199 struct r600_context *ctx = (struct r600_context *)ctx_;
200 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
201
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const unsigned char * code;
205 unsigned i;
206
207 shader->llvm_ctx = LLVMContextCreate();
208
209 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
210
211 header = cso->prog;
212 code = cso->prog + sizeof(struct pipe_llvm_program_header);
213 #endif
214
215 shader->ctx = (struct r600_context*)ctx;
216 shader->local_size = cso->req_local_mem;
217 shader->private_size = cso->req_private_mem;
218 shader->input_size = cso->req_input_mem;
219
220 #ifdef HAVE_OPENCL
221 shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx, code,
222 header->num_bytes);
223 shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
224
225 for (i = 0; i < shader->num_kernels; i++) {
226 struct r600_kernel *kernel = &shader->kernels[i];
227 kernel->llvm_module = radeon_llvm_get_kernel_module(shader->llvm_ctx, i,
228 code, header->num_bytes);
229 }
230 #endif
231 return shader;
232 }
233
234 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
235 {
236 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
237
238 if (!shader)
239 return;
240
241 #ifdef HAVE_OPENCL
242 if (shader->llvm_ctx){
243 LLVMContextDispose(shader->llvm_ctx);
244 }
245 #endif
246
247 free(shader);
248 }
249
250 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
251 {
252 struct r600_context *ctx = (struct r600_context *)ctx_;
253
254 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
255
256 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
257 }
258
259 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
260 * kernel parameters, there are implicit parameters that need to be stored
261 * in the vertex buffer as well. Here is how these parameters are organized in
262 * the buffer:
263 *
264 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
265 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
266 * DWORDS 6-8: Number of work items within each work group in each dimension
267 * (x,y,z)
268 * DWORDS 9+ : Kernel parameters
269 */
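/* Worked example (illustrative only): for a grid of (4, 2, 1) work groups of
 * (16, 16, 1) work items each, the implicit section would be filled with
 *   DWORDS 0-2: 4, 2, 1    (work groups per dimension)
 *   DWORDS 3-5: 64, 32, 1  (global work items = grid * block, computed below)
 *   DWORDS 6-8: 16, 16, 1  (work items per work group)
 * and the kernel parameters would follow at DWORD 9.
 */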
270 void evergreen_compute_upload_input(
271 struct pipe_context *ctx_,
272 const uint *block_layout,
273 const uint *grid_layout,
274 const void *input)
275 {
276 struct r600_context *ctx = (struct r600_context *)ctx_;
277 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
278 int i;
279 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
280 * parameters.
281 */
282 unsigned input_size = shader->input_size + 36;
283 uint32_t * num_work_groups_start;
284 uint32_t * global_size_start;
285 uint32_t * local_size_start;
286 uint32_t * kernel_parameters_start;
287 struct pipe_box box;
288 struct pipe_transfer *transfer = NULL;
289
290 if (shader->input_size == 0) {
291 return;
292 }
293
294 if (!shader->kernel_param) {
295 /* Add space for the grid dimensions */
296 shader->kernel_param = (struct r600_resource *)
297 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
298 PIPE_USAGE_IMMUTABLE, input_size);
299 }
300
301 u_box_1d(0, input_size, &box);
302 num_work_groups_start = ctx_->transfer_map(ctx_,
303 (struct pipe_resource*)shader->kernel_param,
304 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
305 &box, &transfer);
306 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
307 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
308 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
309
310 /* Copy the work group size */
311 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
312
313 /* Copy the global size */
314 for (i = 0; i < 3; i++) {
315 global_size_start[i] = grid_layout[i] * block_layout[i];
316 }
317
318 /* Copy the local dimensions */
319 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
320
321 /* Copy the kernel inputs */
322 memcpy(kernel_parameters_start, input, shader->input_size);
323
324 for (i = 0; i < (input_size / 4); i++) {
325 COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
326 ((unsigned*)num_work_groups_start)[i]);
327 }
328
329 ctx_->transfer_unmap(ctx_, transfer);
330
331 /* ID=0 is reserved for the parameters */
332 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
333 (struct pipe_resource*)shader->kernel_param);
334 }
335
336 static void evergreen_emit_direct_dispatch(
337 struct r600_context *rctx,
338 const uint *block_layout, const uint *grid_layout)
339 {
340 int i;
341 struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
342 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
343 unsigned num_waves;
344 unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
345 unsigned wave_divisor = (16 * num_pipes);
346 int group_size = 1;
347 int grid_size = 1;
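/* shader->local_size comes from pipe_compute_state::req_local_mem and is in
 * bytes, so divide by 4 to get dwords; bc.nlds_dw is the LDS already used by
 * the compiled kernel itself. */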
348 unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;
349
350 /* Calculate group_size/grid_size */
351 for (i = 0; i < 3; i++) {
352 group_size *= block_layout[i];
353 }
354
355 for (i = 0; i < 3; i++) {
356 grid_size *= grid_layout[i];
357 }
358
359 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
360 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
361 wave_divisor - 1) / wave_divisor;
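/* For example, a 16x16x1 block on an 8-pipe chip gives
 * ceil(256 / (16 * 8)) = 2 wavefronts per thread block. */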
362
363 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
364 "%u wavefronts per thread block, "
365 "allocating %u dwords lds.\n",
366 num_pipes, num_waves, lds_size);
367
368 r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
369
370 r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
371 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
372 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
373 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
374
375 r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
376 group_size);
377
378 r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
379 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
380 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
381 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
382
383 if (rctx->b.chip_class < CAYMAN) {
384 assert(lds_size <= 8192);
385 } else {
386 /* Cayman appears to have a slightly smaller limit, see the
387 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
388 assert(lds_size <= 8160);
389 }
390
391 r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
392 lds_size | (num_waves << 14));
393
394 /* Dispatch packet */
395 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
396 radeon_emit(cs, grid_layout[0]);
397 radeon_emit(cs, grid_layout[1]);
398 radeon_emit(cs, grid_layout[2]);
399 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
400 radeon_emit(cs, 1);
401 }
402
403 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
404 const uint *grid_layout)
405 {
406 struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
407 int i;
408
409 /* Make sure that the gfx ring is the only one active. */
410 if (ctx->b.rings.dma.cs) {
411 ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
412 }
413
414 /* Initialize all the compute-related registers.
415 *
416 * See evergreen_init_atom_start_compute_cs() in this file for the list
417 * of registers initialized by the start_compute_cs_cmd atom.
418 */
419 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
420
421 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
422 r600_flush_emit(ctx);
423
424 /* Emit colorbuffers. */
425 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
426 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
427 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
428 unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
429 (struct r600_resource*)cb->base.texture,
430 RADEON_USAGE_READWRITE);
431
432 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
433 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
434 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
435 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
436 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
437 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
438 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
439 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
440
441 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
442 radeon_emit(cs, reloc);
443
444 if (!ctx->keep_tiling_flags) {
445 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
446 radeon_emit(cs, reloc);
447 }
448
449 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
450 radeon_emit(cs, reloc);
451 }
452 if (ctx->keep_tiling_flags) {
453 for (; i < 8 ; i++) {
454 r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
455 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
456 }
457 for (; i < 12; i++) {
458 r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
459 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
460 }
461 }
462
463 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
464 r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
465 ctx->compute_cb_target_mask);
466
467
468 /* Emit vertex buffer state */
469 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
470 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
471
472 /* Emit constant buffer state */
473 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
474
475 /* Emit compute shader state */
476 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
477
478 /* Emit dispatch state and dispatch packet */
479 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
480
481 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
482 */
483 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
484 R600_CONTEXT_INV_VERTEX_CACHE |
485 R600_CONTEXT_INV_TEX_CACHE;
486 r600_flush_emit(ctx);
487 ctx->b.flags = 0;
488
489 if (ctx->b.chip_class >= CAYMAN) {
490 ctx->skip_surface_sync_on_next_cs_flush = true;
491 }
492
493 #if 0
494 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
495 for (i = 0; i < cs->cdw; i++) {
496 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
497 }
498 #endif
499
500 }
501
502
503 /**
504 * Emit function for r600_cs_shader_state atom
505 */
506 void evergreen_emit_cs_shader(
507 struct r600_context *rctx,
508 struct r600_atom *atom)
509 {
510 struct r600_cs_shader_state *state =
511 (struct r600_cs_shader_state*)atom;
512 struct r600_pipe_compute *shader = state->shader;
513 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
514 struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
515 uint64_t va;
516
517 va = r600_resource_va(&rctx->screen->b.b, &kernel->code_bo->b.b);
518
519 r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
520 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
521 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
522 S_0288D4_NUM_GPRS(kernel->bc.ngpr)
523 | S_0288D4_STACK_SIZE(kernel->bc.nstack));
524 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
525
526 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
527 radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
528 kernel->code_bo, RADEON_USAGE_READ));
529 }
530
531 static void evergreen_launch_grid(
532 struct pipe_context *ctx_,
533 const uint *block_layout, const uint *grid_layout,
534 uint32_t pc, const void *input)
535 {
536 struct r600_context *ctx = (struct r600_context *)ctx_;
537
538 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
539 struct r600_kernel *kernel = &shader->kernels[pc];
540
541 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
542
543 #ifdef HAVE_OPENCL
544
545 if (!kernel->code_bo) {
546 void *p;
547 struct r600_bytecode *bc = &kernel->bc;
548 LLVMModuleRef mod = kernel->llvm_module;
549 boolean use_kill = false;
550 bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
551 unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
552 unsigned sb_disasm = use_sb ||
553 (ctx->screen->b.debug_flags & DBG_SB_DISASM);
554
555 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
556 ctx->screen->has_compressed_msaa_texturing);
557 bc->type = TGSI_PROCESSOR_COMPUTE;
558 bc->isa = ctx->isa;
559 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
560
561 if (dump && !sb_disasm) {
562 r600_bytecode_disasm(bc);
563 } else if ((dump && sb_disasm) || use_sb) {
564 if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
565 R600_ERR("r600_sb_bytecode_process failed!\n");
566 }
567
568 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
569 kernel->bc.ndw * 4);
570 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
571 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
572 ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
573 }
574 #endif
575 shader->active_kernel = kernel;
576 ctx->cs_shader_state.kernel_index = pc;
577 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
578 compute_emit_cs(ctx, block_layout, grid_layout);
579 }
580
581 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
582 unsigned start, unsigned count,
583 struct pipe_surface ** surfaces)
584 {
585 struct r600_context *ctx = (struct r600_context *)ctx_;
586 struct r600_surface **resources = (struct r600_surface **)surfaces;
587
588 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
589 start, count);
590
591 for (int i = 0; i < count; i++) {
592 /* The first two vertex buffers are reserved for parameters and
593 * global buffers. */
594 unsigned vtx_id = 2 + i;
595 if (resources[i]) {
596 struct r600_resource_global *buffer =
597 (struct r600_resource_global*)
598 resources[i]->base.texture;
599 if (resources[i]->base.writable) {
600 assert(i+1 < 12);
601
602 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
603 (struct r600_resource *)resources[i]->base.texture,
604 buffer->chunk->start_in_dw*4,
605 resources[i]->base.texture->width0);
606 }
607
608 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
609 buffer->chunk->start_in_dw * 4,
610 resources[i]->base.texture);
611 }
612 }
613 }
614
615 void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
616 unsigned start_slot, unsigned count,
617 struct pipe_sampler_view **views)
618 {
619 struct r600_pipe_sampler_view **resource =
620 (struct r600_pipe_sampler_view **)views;
621
622 for (int i = 0; i < count; i++) {
623 if (resource[i]) {
624 assert(i+1 < 12);
625 /* XXX: Implement */
626 assert(!"Compute samplers not implemented.");
627 /* FETCH0 = VTX0 (param buffer),
628 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
629 }
630 }
631 }
632
633
634 static void evergreen_set_global_binding(
635 struct pipe_context *ctx_, unsigned first, unsigned n,
636 struct pipe_resource **resources,
637 uint32_t **handles)
638 {
639 struct r600_context *ctx = (struct r600_context *)ctx_;
640 struct compute_memory_pool *pool = ctx->screen->global_pool;
641 struct r600_resource_global **buffers =
642 (struct r600_resource_global **)resources;
643
644 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
645 first, n);
646
647 if (!resources) {
648 /* XXX: Unset */
649 return;
650 }
651
652 compute_memory_finalize_pending(pool, ctx_);
653
654 for (int i = 0; i < n; i++)
655 {
656 assert(resources[i]->target == PIPE_BUFFER);
657 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
658
659 *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
660 }
661
662 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
663 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
664 (struct pipe_resource*)pool->bo);
665 }
666
667 /**
668 * This function initializes all the compute specific registers that need to
669 * be initialized for each compute command stream. Registers that are common
670 * to both compute and 3D will be initialized at the beginning of each compute
671 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
672 * packet requires that the shader type bit be set, we must initialize all
673 * context registers needed for compute in this function. The registers
674 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
675 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
676 * on the GPU family.
677 */
678 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
679 {
680 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
681 int num_threads;
682 int num_stack_entries;
683
684 /* since all required registers are initialised in the
685 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
686 */
687 r600_init_command_buffer(cb, 256);
688 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
689
690 /* This must be first. */
691 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
692 r600_store_value(cb, 0x80000000);
693 r600_store_value(cb, 0x80000000);
694
695 /* We're setting config registers here. */
696 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
697 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
698
699 switch (ctx->b.family) {
700 case CHIP_CEDAR:
701 default:
702 num_threads = 128;
703 num_stack_entries = 256;
704 break;
705 case CHIP_REDWOOD:
706 num_threads = 128;
707 num_stack_entries = 256;
708 break;
709 case CHIP_JUNIPER:
710 num_threads = 128;
711 num_stack_entries = 512;
712 break;
713 case CHIP_CYPRESS:
714 case CHIP_HEMLOCK:
715 num_threads = 128;
716 num_stack_entries = 512;
717 break;
718 case CHIP_PALM:
719 num_threads = 128;
720 num_stack_entries = 256;
721 break;
722 case CHIP_SUMO:
723 num_threads = 128;
724 num_stack_entries = 256;
725 break;
726 case CHIP_SUMO2:
727 num_threads = 128;
728 num_stack_entries = 512;
729 break;
730 case CHIP_BARTS:
731 num_threads = 128;
732 num_stack_entries = 512;
733 break;
734 case CHIP_TURKS:
735 num_threads = 128;
736 num_stack_entries = 256;
737 break;
738 case CHIP_CAICOS:
739 num_threads = 128;
740 num_stack_entries = 256;
741 break;
742 }
743
744 /* Config Registers */
745 if (ctx->b.chip_class < CAYMAN)
746 evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
747 ctx->screen->b.info.drm_minor);
748 else
749 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
750 ctx->screen->b.info.drm_minor);
751
752 /* The primitive type always needs to be POINTLIST for compute. */
753 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
754 V_008958_DI_PT_POINTLIST);
755
756 if (ctx->b.chip_class < CAYMAN) {
757
758 /* These registers control which simds can be used by each stage.
759 * The default for these registers is 0xffffffff, which means
760 * all simds are available for each stage. It's possible we may
761 * want to play around with these in the future, but for now
762 * the default value is fine.
763 *
764 * R_008E20_SQ_STATIC_THREAD_MGMT1
765 * R_008E24_SQ_STATIC_THREAD_MGMT2
766 * R_008E28_SQ_STATIC_THREAD_MGMT3
767 */
768
769 /* XXX: We may need to adjust the thread and stack resource
770 * values for 3D/compute interop */
771
772 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
773
774 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
775 * Set the number of threads used by the PS/VS/GS/ES stage to
776 * 0.
777 */
778 r600_store_value(cb, 0);
779
780 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
781 * Set the number of threads used by the CS (aka LS) stage to
782 * the maximum number of threads and set the number of threads
783 * for the HS stage to 0. */
784 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
785
786 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
787 * Set the Control Flow stack entries to 0 for PS/VS stages */
788 r600_store_value(cb, 0);
789
790 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
791 * Set the Control Flow stack entries to 0 for GS/ES stages */
792 r600_store_value(cb, 0);
793
794 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
795 * Set the Control Flow stack entries to 0 for the HS stage, and
796 * set it to the maximum value for the CS (aka LS) stage. */
797 r600_store_value(cb,
798 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
799 }
800 /* Give the compute shader all the available LDS space.
801 * NOTE: This only sets the maximum number of dwords that a compute
802 * shader can allocate. When a shader is executed, we still need to
803 * allocate the appropriate amount of LDS dwords using the
804 * CM_R_0288E8_SQ_LDS_ALLOC register.
805 */
806 if (ctx->b.chip_class < CAYMAN) {
807 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
808 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
809 } else {
810 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
811 S_0286FC_NUM_PS_LDS(0) |
812 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
813 }
814
815 /* Context Registers */
816
817 if (ctx->b.chip_class < CAYMAN) {
818 /* workaround for hw issues with dyn gpr - must set all limits
819 * to 240 instead of 0, 0x1e == 240 / 8
820 */
821 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
822 S_028838_PS_GPRS(0x1e) |
823 S_028838_VS_GPRS(0x1e) |
824 S_028838_GS_GPRS(0x1e) |
825 S_028838_ES_GPRS(0x1e) |
826 S_028838_HS_GPRS(0x1e) |
827 S_028838_LS_GPRS(0x1e));
828 }
829
830 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
831 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
832 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
833
834 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
835
836 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
837 S_0286E8_TID_IN_GROUP_ENA |
838 S_0286E8_TGID_ENA |
839 S_0286E8_DISABLE_INDEX_PACK);
840
841
842 /* The LOOP_CONST registers are an optimization for loops that allows
843 * you to store the initial counter, increment value, and maximum
844 * counter value in a register so that the hardware can calculate the
845 * correct number of iterations for the loop, so that you don't need
846 * to have the loop counter in your shader code. We don't currently use
847 * this optimization, so we must keep track of the counter in the
848 * shader and use a break instruction to exit loops. However, the
849 * hardware will still use this register to determine when to exit a
850 * loop, so we need to initialize the counter to 0, set the increment
851 * value to 1, and set the maximum counter value to 4095 (0xfff), which
852 * is the maximum value allowed. This gives us a maximum of 4096
853 * iterations for our loops, but hopefully our break instruction will
854 * execute some time before the 4096th iteration.
855 */
856 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
857 }
858
859 void evergreen_init_compute_state_functions(struct r600_context *ctx)
860 {
861 ctx->b.b.create_compute_state = evergreen_create_compute_state;
862 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
863 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
864 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
865 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
866 ctx->b.b.set_global_binding = evergreen_set_global_binding;
867 ctx->b.b.launch_grid = evergreen_launch_grid;
868
869 /* We always use at least one vertex buffer for parameters (id = 1). */
870 ctx->cs_vertex_buffer_state.enabled_mask =
871 ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
872 }
873
874 struct pipe_resource *r600_compute_global_buffer_create(
875 struct pipe_screen *screen,
876 const struct pipe_resource *templ)
877 {
878 struct r600_resource_global* result = NULL;
879 struct r600_screen* rscreen = NULL;
880 int size_in_dw = 0;
881
882 assert(templ->target == PIPE_BUFFER);
883 assert(templ->bind & PIPE_BIND_GLOBAL);
884 assert(templ->array_size == 1 || templ->array_size == 0);
885 assert(templ->depth0 == 1 || templ->depth0 == 0);
886 assert(templ->height0 == 1 || templ->height0 == 0);
887
888 result = (struct r600_resource_global*)
889 CALLOC(sizeof(struct r600_resource_global), 1);
890 rscreen = (struct r600_screen*)screen;
891
892 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
893 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
894 templ->array_size);
895
896 result->base.b.vtbl = &r600_global_buffer_vtbl;
897 result->base.b.b.screen = screen;
898 result->base.b.b = *templ;
899 pipe_reference_init(&result->base.b.b.reference, 1);
900
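/* Round the requested width in bytes up to a whole number of dwords. */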
901 size_in_dw = (templ->width0+3) / 4;
902
903 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
904
905 if (result->chunk == NULL)
906 {
907 free(result);
908 return NULL;
909 }
910
911 return &result->base.b.b;
912 }
913
914 void r600_compute_global_buffer_destroy(
915 struct pipe_screen *screen,
916 struct pipe_resource *res)
917 {
918 struct r600_resource_global* buffer = NULL;
919 struct r600_screen* rscreen = NULL;
920
921 assert(res->target == PIPE_BUFFER);
922 assert(res->bind & PIPE_BIND_GLOBAL);
923
924 buffer = (struct r600_resource_global*)res;
925 rscreen = (struct r600_screen*)screen;
926
927 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
928
929 buffer->chunk = NULL;
930 free(res);
931 }
932
933 void *r600_compute_global_transfer_map(
934 struct pipe_context *ctx_,
935 struct pipe_resource *resource,
936 unsigned level,
937 unsigned usage,
938 const struct pipe_box *box,
939 struct pipe_transfer **ptransfer)
940 {
941 struct r600_context *rctx = (struct r600_context*)ctx_;
942 struct compute_memory_pool *pool = rctx->screen->global_pool;
943 struct r600_resource_global* buffer =
944 (struct r600_resource_global*)resource;
945
946 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
947 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
948 "width = %u, height = %u, depth = %u)\n", level, usage,
949 box->x, box->y, box->z, box->width, box->height,
950 box->depth);
951 COMPUTE_DBG(rctx->screen, "Buffer id = %u offset = "
952 "%u (box.x)\n", buffer->chunk->id, box->x);
953
954
955 compute_memory_finalize_pending(pool, ctx_);
956
957 assert(resource->target == PIPE_BUFFER);
958 assert(resource->bind & PIPE_BIND_GLOBAL);
959 assert(box->x >= 0);
960 assert(box->y == 0);
961 assert(box->z == 0);
962
963 /* TODO: do it better; mapping is not possible if the pool is too big */
964 return pipe_buffer_map_range(ctx_, (struct pipe_resource*)buffer->chunk->pool->bo,
965 box->x + (buffer->chunk->start_in_dw * 4),
966 box->width, usage, ptransfer);
967 }
968
969 void r600_compute_global_transfer_unmap(
970 struct pipe_context *ctx_,
971 struct pipe_transfer* transfer)
972 {
973 /* struct r600_resource_global are not real resources, they just map
974 * to an offset within the compute memory pool. The function
975 * r600_compute_global_transfer_map() maps the memory pool
976 * resource rather than the struct r600_resource_global passed to
977 * it as an argument and then initializes ptransfer->resource with
978 * the memory pool resource (via pipe_buffer_map_range).
979 * When transfer_unmap is called it uses the memory pool's
980 * vtable, which calls r600_buffer_transfer_unmap() rather than
981 * this function.
982 */
983 assert (!"This function should not be called");
984 }
985
986 void r600_compute_global_transfer_flush_region(
987 struct pipe_context *ctx_,
988 struct pipe_transfer *transfer,
989 const struct pipe_box *box)
990 {
991 assert(0 && "TODO");
992 }
993
994 void r600_compute_global_transfer_inline_write(
995 struct pipe_context *pipe,
996 struct pipe_resource *resource,
997 unsigned level,
998 unsigned usage,
999 const struct pipe_box *box,
1000 const void *data,
1001 unsigned stride,
1002 unsigned layer_stride)
1003 {
1004 assert(0 && "TODO");
1005 }