src/gallium/drivers/r600/evergreen_compute.c

   1 /*
   2  * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Adam Rak <adam.rak@streamnovation.com>
  25  */
  26
  27 #include <stdio.h>
  28 #include <errno.h>
  29 #include "pipe/p_defines.h"
  30 #include "pipe/p_state.h"
  31 #include "pipe/p_context.h"
  32 #include "util/u_blitter.h"
  33 #include "util/u_double_list.h"
  34 #include "util/u_transfer.h"
  35 #include "util/u_surface.h"
  36 #include "util/u_pack_color.h"
  37 #include "util/u_memory.h"
  38 #include "util/u_inlines.h"
  39 #include "util/u_framebuffer.h"
  40 #include "pipebuffer/pb_buffer.h"
  41 #include "evergreend.h"
  42 #include "r600_shader.h"
  43 #include "r600_pipe.h"
  44 #include "r600_formats.h"
  45 #include "evergreen_compute.h"
  46 #include "evergreen_compute_internal.h"
  47 #include "compute_memory_pool.h"
  48 #include "sb/sb_public.h"
  49 #ifdef HAVE_OPENCL
  50 #include "radeon_llvm_util.h"
  51 #endif
  52 #include <inttypes.h>
  53
  54 /**
  55 RAT0 is for global binding write
  56 VTX1 is for global binding read
  57
  58 for writing images RAT1...
  59 for reading images TEX2...
  60   TEX2-RAT1 is paired
  61
  62 TEX2... consumes the same fetch resources, that VTX2... would consume
  63
  64 CONST0 and VTX0 is for parameters
  65   CONST0 is binding smaller input parameter buffer, and for constant indexing,
  66   also constant cached
  67   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  68   the constant cache can handle
  69
  70 RAT-s are limited to 12, so we can only bind at most 11 texture for writing
  71 because we reserve RAT0 for global bindings. With byteaddressing enabled,
  72 we should reserve another one too.=> 10 image binding for writing max.
  73
  74 from Nvidia OpenCL:
  75   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  76   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
  77
  78 so 10 for writing is enough. 176 is the max for reading according to the docs
  79
  80 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
  81 writable images will consume TEX slots, VTX slots too because of linear indexing
  82
  83 */
  84
  85 struct r600_resource* r600_compute_buffer_alloc_vram(
  86        struct r600_screen *screen,
  87        unsigned size)
  88 {
  89         struct pipe_resource * buffer = NULL;
  90         assert(size);
  91
  92         buffer = pipe_buffer_create(
  93                 (struct pipe_screen*) screen,
  94                 PIPE_BIND_CUSTOM,
  95                 PIPE_USAGE_IMMUTABLE,
  96                 size);
  97
  98         return (struct r600_resource *)buffer;
  99 }
 100
 101
 102 static void evergreen_set_rat(
 103         struct r600_pipe_compute *pipe,
 104         unsigned id,
 105         struct r600_resource* bo,
 106         int start,
 107         int size)
 108 {
 109         struct pipe_surface rat_templ;
 110         struct r600_surface *surf = NULL;
 111         struct r600_context *rctx = NULL;
 112
 113         assert(id < 12);
 114         assert((size & 3) == 0);
 115         assert((start & 0xFF) == 0);
 116
 117         rctx = pipe->ctx;
 118
 119         COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
 120
 121         /* Create the RAT surface */
 122         memset(&rat_templ, 0, sizeof(rat_templ));
 123         rat_templ.format = PIPE_FORMAT_R32_UINT;
 124         rat_templ.u.tex.level = 0;
 125         rat_templ.u.tex.first_layer = 0;
 126         rat_templ.u.tex.last_layer = 0;
 127
 128         /* Add the RAT the list of color buffers */
 129         pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
 130                 (struct pipe_context *)pipe->ctx,
 131                 (struct pipe_resource *)bo, &rat_templ);
 132
 133         /* Update the number of color buffers */
 134         pipe->ctx->framebuffer.state.nr_cbufs =
 135                 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
 136
 137         /* Update the cb_target_mask
 138          * XXX: I think this is a potential spot for bugs once we start doing
 139          * GL interop.  cb_target_mask may be modified in the 3D sections
 140          * of this driver. */
 141         pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
 142
 143         surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
 144         evergreen_init_color_surface_rat(rctx, surf);
 145 }
 146
 147 static void evergreen_cs_set_vertex_buffer(
 148         struct r600_context * rctx,
 149         unsigned vb_index,
 150         unsigned offset,
 151         struct pipe_resource * buffer)
 152 {
 153         struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
 154         struct pipe_vertex_buffer *vb = &state->vb[vb_index];
 155         vb->stride = 1;
 156         vb->buffer_offset = offset;
 157         vb->buffer = buffer;
 158         vb->user_buffer = NULL;
 159
 160         /* The vertex instructions in the compute shaders use the texture cache,
 161          * so we need to invalidate it. */
 162         rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 163         state->enabled_mask |= 1 << vb_index;
 164         state->dirty_mask |= 1 << vb_index;
 165         state->atom.dirty = true;
 166 }
 167
 168 static void evergreen_cs_set_constant_buffer(
 169         struct r600_context * rctx,
 170         unsigned cb_index,
 171         unsigned offset,
 172         unsigned size,
 173         struct pipe_resource * buffer)
 174 {
 175         struct pipe_constant_buffer cb;
 176         cb.buffer_size = size;
 177         cb.buffer_offset = offset;
 178         cb.buffer = buffer;
 179         cb.user_buffer = NULL;
 180
 181         rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
 182 }
 183
 184 static const struct u_resource_vtbl r600_global_buffer_vtbl =
 185 {
 186         u_default_resource_get_handle, /* get_handle */
 187         r600_compute_global_buffer_destroy, /* resource_destroy */
 188         r600_compute_global_transfer_map, /* transfer_map */
 189         r600_compute_global_transfer_flush_region,/* transfer_flush_region */
 190         r600_compute_global_transfer_unmap, /* transfer_unmap */
 191         r600_compute_global_transfer_inline_write /* transfer_inline_write */
 192 };
 193
 194
 195 void *evergreen_create_compute_state(
 196         struct pipe_context *ctx_,
 197         const const struct pipe_compute_state *cso)
 198 {
 199         struct r600_context *ctx = (struct r600_context *)ctx_;
 200         struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
 201
 202 #ifdef HAVE_OPENCL
 203         const struct pipe_llvm_program_header * header;
 204         const unsigned char * code;
 205         unsigned i;
 206
 207         shader->llvm_ctx = LLVMContextCreate();
 208
 209         COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
 210
 211         header = cso->prog;
 212         code = cso->prog + sizeof(struct pipe_llvm_program_header);
 213 #endif
 214
 215         shader->ctx = (struct r600_context*)ctx;
 216         shader->local_size = cso->req_local_mem;
 217         shader->private_size = cso->req_private_mem;
 218         shader->input_size = cso->req_input_mem;
 219
 220 #ifdef HAVE_OPENCL
 221         shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx, code,
 222                                                         header->num_bytes);
 223         shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
 224
 225         for (i = 0; i < shader->num_kernels; i++) {
 226                 struct r600_kernel *kernel = &shader->kernels[i];
 227                 kernel->llvm_module = radeon_llvm_get_kernel_module(shader->llvm_ctx, i,
 228                                                         code, header->num_bytes);
 229         }
 230 #endif
 231         return shader;
 232 }
 233
 234 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
 235 {
 236         struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
 237
 238         if (!shader)
 239                 return;
 240
 241         FREE(shader->kernels);
 242
 243 #ifdef HAVE_OPENCL
 244         if (shader->llvm_ctx){
 245                 LLVMContextDispose(shader->llvm_ctx);
 246         }
 247 #endif
 248
 249         FREE(shader);
 250 }
 251
 252 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
 253 {
 254         struct r600_context *ctx = (struct r600_context *)ctx_;
 255
 256         COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
 257
 258         ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
 259 }
 260
 261 /* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
 262  * kernel parameters there are implicit parameters that need to be stored
 263  * in the vertex buffer as well.  Here is how these parameters are organized in
 264  * the buffer:
 265  *
 266  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 267  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 268  * DWORDS 6-8: Number of work items within each work group in each dimension
 269  *             (x,y,z)
 270  * DWORDS 9+ : Kernel parameters
 271  */
 272 void evergreen_compute_upload_input(
 273         struct pipe_context *ctx_,
 274         const uint *block_layout,
 275         const uint *grid_layout,
 276         const void *input)
 277 {
 278         struct r600_context *ctx = (struct r600_context *)ctx_;
 279         struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
 280         unsigned i;
 281         /* We need to reserve 9 dwords (36 bytes) for implicit kernel
 282          * parameters.
 283          */
 284         unsigned input_size = shader->input_size + 36;
 285         uint32_t * num_work_groups_start;
 286         uint32_t * global_size_start;
 287         uint32_t * local_size_start;
 288         uint32_t * kernel_parameters_start;
 289         struct pipe_box box;
 290         struct pipe_transfer *transfer = NULL;
 291
 292         if (shader->input_size == 0) {
 293                 return;
 294         }
 295
 296         if (!shader->kernel_param) {
 297                 /* Add space for the grid dimensions */
 298                 shader->kernel_param = (struct r600_resource *)
 299                         pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
 300                                         PIPE_USAGE_IMMUTABLE, input_size);
 301         }
 302
 303         u_box_1d(0, input_size, &box);
 304         num_work_groups_start = ctx_->transfer_map(ctx_,
 305                         (struct pipe_resource*)shader->kernel_param,
 306                         0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
 307                         &box, &transfer);
 308         global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
 309         local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
 310         kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
 311
 312         /* Copy the work group size */
 313         memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
 314
 315         /* Copy the global size */
 316         for (i = 0; i < 3; i++) {
 317                 global_size_start[i] = grid_layout[i] * block_layout[i];
 318         }
 319
 320         /* Copy the local dimensions */
 321         memcpy(local_size_start, block_layout, 3 * sizeof(uint));
 322
 323         /* Copy the kernel inputs */
 324         memcpy(kernel_parameters_start, input, shader->input_size);
 325
 326         for (i = 0; i < (input_size / 4); i++) {
 327                 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
 328                         ((unsigned*)num_work_groups_start)[i]);
 329         }
 330
 331         ctx_->transfer_unmap(ctx_, transfer);
 332
 333         /* ID=0 is reserved for the parameters */
 334         evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
 335                         (struct pipe_resource*)shader->kernel_param);
 336 }
 337
 338 static void evergreen_emit_direct_dispatch(
 339                 struct r600_context *rctx,
 340                 const uint *block_layout, const uint *grid_layout)
 341 {
 342         int i;
 343         struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 344         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 345         unsigned num_waves;
 346         unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
 347         unsigned wave_divisor = (16 * num_pipes);
 348         int group_size = 1;
 349         int grid_size = 1;
 350         unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;
 351
 352         /* Calculate group_size/grid_size */
 353         for (i = 0; i < 3; i++) {
 354                 group_size *= block_layout[i];
 355         }
 356
 357         for (i = 0; i < 3; i++) {
 358                 grid_size *= grid_layout[i];
 359         }
 360
 361         /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
 362         num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
 363                         wave_divisor - 1) / wave_divisor;
 364
 365         COMPUTE_DBG(rctx->screen, "Using %u pipes, "
 366                                 "%u wavefronts per thread block, "
 367                                 "allocating %u dwords lds.\n",
 368                                 num_pipes, num_waves, lds_size);
 369
 370         r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
 371
 372         r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
 373         radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
 374         radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
 375         radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
 376
 377         r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
 378                                                                 group_size);
 379
 380         r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
 381         radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
 382         radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
 383         radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
 384
 385         if (rctx->b.chip_class < CAYMAN) {
 386                 assert(lds_size <= 8192);
 387         } else {
 388                 /* Cayman appears to have a slightly smaller limit, see the
 389                  * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
 390                 assert(lds_size <= 8160);
 391         }
 392
 393         r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
 394                                         lds_size | (num_waves << 14));
 395
 396         /* Dispatch packet */
 397         radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
 398         radeon_emit(cs, grid_layout[0]);
 399         radeon_emit(cs, grid_layout[1]);
 400         radeon_emit(cs, grid_layout[2]);
 401         /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
 402         radeon_emit(cs, 1);
 403 }
 404
 405 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 406                 const uint *grid_layout)
 407 {
 408         struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
 409         unsigned i;
 410
 411         /* make sure that the gfx ring is only one active */
 412         if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
 413                 ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 414         }
 415
 416         /* Initialize all the compute-related registers.
 417          *
 418          * See evergreen_init_atom_start_compute_cs() in this file for the list
 419          * of registers initialized by the start_compute_cs_cmd atom.
 420          */
 421         r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
 422
 423         ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
 424         r600_flush_emit(ctx);
 425
 426         /* Emit colorbuffers. */
 427         /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
 428         for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
 429                 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
 430                 unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
 431                                                        (struct r600_resource*)cb->base.texture,
 432                                                        RADEON_USAGE_READWRITE,
 433                                                        RADEON_PRIO_SHADER_RESOURCE_RW);
 434
 435                 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
 436                 radeon_emit(cs, cb->cb_color_base);     /* R_028C60_CB_COLOR0_BASE */
 437                 radeon_emit(cs, cb->cb_color_pitch);    /* R_028C64_CB_COLOR0_PITCH */
 438                 radeon_emit(cs, cb->cb_color_slice);    /* R_028C68_CB_COLOR0_SLICE */
 439                 radeon_emit(cs, cb->cb_color_view);     /* R_028C6C_CB_COLOR0_VIEW */
 440                 radeon_emit(cs, cb->cb_color_info);     /* R_028C70_CB_COLOR0_INFO */
 441                 radeon_emit(cs, cb->cb_color_attrib);   /* R_028C74_CB_COLOR0_ATTRIB */
 442                 radeon_emit(cs, cb->cb_color_dim);              /* R_028C78_CB_COLOR0_DIM */
 443
 444                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
 445                 radeon_emit(cs, reloc);
 446
 447                 if (!ctx->keep_tiling_flags) {
 448                         radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
 449                         radeon_emit(cs, reloc);
 450                 }
 451
 452                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
 453                 radeon_emit(cs, reloc);
 454         }
 455         if (ctx->keep_tiling_flags) {
 456                 for (; i < 8 ; i++) {
 457                         r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
 458                                                        S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 459                 }
 460                 for (; i < 12; i++) {
 461                         r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
 462                                                        S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 463                 }
 464         }
 465
 466         /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
 467         r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
 468                                         ctx->compute_cb_target_mask);
 469
 470
 471         /* Emit vertex buffer state */
 472         ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
 473         r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
 474
 475         /* Emit constant buffer state */
 476         r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
 477
 478         /* Emit compute shader state */
 479         r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
 480
 481         /* Emit dispatch state and dispatch packet */
 482         evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
 483
 484         /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
 485          */
 486         ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
 487                       R600_CONTEXT_INV_VERTEX_CACHE |
 488                       R600_CONTEXT_INV_TEX_CACHE;
 489         r600_flush_emit(ctx);
 490         ctx->b.flags = 0;
 491
 492         if (ctx->b.chip_class >= CAYMAN) {
 493                 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
 494                 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
 495                 /* DEALLOC_STATE prevents the GPU from hanging when a
 496                  * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
 497                  * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
 498                  */
 499                 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
 500                 cs->buf[cs->cdw++] = 0;
 501         }
 502
 503 #if 0
 504         COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
 505         for (i = 0; i < cs->cdw; i++) {
 506                 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
 507         }
 508 #endif
 509
 510 }
 511
 512
 513 /**
 514  * Emit function for r600_cs_shader_state atom
 515  */
 516 void evergreen_emit_cs_shader(
 517                 struct r600_context *rctx,
 518                 struct r600_atom *atom)
 519 {
 520         struct r600_cs_shader_state *state =
 521                                         (struct r600_cs_shader_state*)atom;
 522         struct r600_pipe_compute *shader = state->shader;
 523         struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
 524         struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 525
 526         r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
 527         radeon_emit(cs, kernel->code_bo->gpu_address >> 8); /* R_0288D0_SQ_PGM_START_LS */
 528         radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
 529                         S_0288D4_NUM_GPRS(kernel->bc.ngpr)
 530                         | S_0288D4_STACK_SIZE(kernel->bc.nstack));
 531         radeon_emit(cs, 0);     /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
 532
 533         radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
 534         radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
 535                                               kernel->code_bo, RADEON_USAGE_READ,
 536                                               RADEON_PRIO_SHADER_DATA));
 537 }
 538
 539 static void evergreen_launch_grid(
 540                 struct pipe_context *ctx_,
 541                 const uint *block_layout, const uint *grid_layout,
 542                 uint32_t pc, const void *input)
 543 {
 544         struct r600_context *ctx = (struct r600_context *)ctx_;
 545
 546         struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
 547         struct r600_kernel *kernel = &shader->kernels[pc];
 548
 549         COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
 550
 551 #ifdef HAVE_OPENCL
 552
 553         if (!kernel->code_bo) {
 554                 void *p;
 555                 struct r600_bytecode *bc = &kernel->bc;
 556                 LLVMModuleRef mod = kernel->llvm_module;
 557                 boolean use_kill = false;
 558                 bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
 559                 unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
 560                 unsigned sb_disasm = use_sb ||
 561                         (ctx->screen->b.debug_flags & DBG_SB_DISASM);
 562
 563                 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
 564                            ctx->screen->has_compressed_msaa_texturing);
 565                 bc->type = TGSI_PROCESSOR_COMPUTE;
 566                 bc->isa = ctx->isa;
 567                 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
 568
 569                 if (dump && !sb_disasm) {
 570                         r600_bytecode_disasm(bc);
 571                 } else if ((dump && sb_disasm) || use_sb) {
 572                         if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
 573                                 R600_ERR("r600_sb_bytecode_process failed!\n");
 574                 }
 575
 576                 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
 577                                                         kernel->bc.ndw * 4);
 578                 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
 579                 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
 580                 ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
 581         }
 582 #endif
 583         shader->active_kernel = kernel;
 584         ctx->cs_shader_state.kernel_index = pc;
 585         evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
 586         compute_emit_cs(ctx, block_layout, grid_layout);
 587 }
 588
 589 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
 590                 unsigned start, unsigned count,
 591                 struct pipe_surface ** surfaces)
 592 {
 593         struct r600_context *ctx = (struct r600_context *)ctx_;
 594         struct r600_surface **resources = (struct r600_surface **)surfaces;
 595
 596         COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
 597                         start, count);
 598
 599         for (unsigned i = 0; i < count; i++) {
 600                 /* The First two vertex buffers are reserved for parameters and
 601                  * global buffers. */
 602                 unsigned vtx_id = 2 + i;
 603                 if (resources[i]) {
 604                         struct r600_resource_global *buffer =
 605                                 (struct r600_resource_global*)
 606                                 resources[i]->base.texture;
 607                         if (resources[i]->base.writable) {
 608                                 assert(i+1 < 12);
 609
 610                                 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
 611                                 (struct r600_resource *)resources[i]->base.texture,
 612                                 buffer->chunk->start_in_dw*4,
 613                                 resources[i]->base.texture->width0);
 614                         }
 615
 616                         evergreen_cs_set_vertex_buffer(ctx, vtx_id,
 617                                         buffer->chunk->start_in_dw * 4,
 618                                         resources[i]->base.texture);
 619                 }
 620         }
 621 }
 622
 623 void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
 624                 unsigned start_slot, unsigned count,
 625                 struct pipe_sampler_view **views)
 626 {
 627         struct r600_pipe_sampler_view **resource =
 628                 (struct r600_pipe_sampler_view **)views;
 629
 630         for (unsigned i = 0; i < count; i++)    {
 631                 if (resource[i]) {
 632                         assert(i+1 < 12);
 633                         /* XXX: Implement */
 634                         assert(!"Compute samplers not implemented.");
 635                         ///FETCH0 = VTX0 (param buffer),
 636                         //FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
 637                 }
 638         }
 639 }
 640
 641
 642 static void evergreen_set_global_binding(
 643         struct pipe_context *ctx_, unsigned first, unsigned n,
 644         struct pipe_resource **resources,
 645         uint32_t **handles)
 646 {
 647         struct r600_context *ctx = (struct r600_context *)ctx_;
 648         struct compute_memory_pool *pool = ctx->screen->global_pool;
 649         struct r600_resource_global **buffers =
 650                 (struct r600_resource_global **)resources;
 651         unsigned i;
 652
 653         COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
 654                         first, n);
 655
 656         if (!resources) {
 657                 /* XXX: Unset */
 658                 return;
 659         }
 660
 661         /* We mark these items for promotion to the pool if they
 662          * aren't already there */
 663         for (i = first; i < first + n; i++) {
 664                 struct compute_memory_item *item = buffers[i]->chunk;
 665
 666                 if (!is_item_in_pool(item))
 667                         buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
 668         }
 669
 670         if (compute_memory_finalize_pending(pool, ctx_) == -1) {
 671                 /* XXX: Unset */
 672                 return;
 673         }
 674
 675         for (i = first; i < first + n; i++)
 676         {
 677                 uint32_t buffer_offset;
 678                 uint32_t handle;
 679                 assert(resources[i]->target == PIPE_BUFFER);
 680                 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
 681
 682                 buffer_offset = util_le32_to_cpu(*(handles[i]));
 683                 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
 684
 685                 *(handles[i]) = util_cpu_to_le32(handle);
 686         }
 687
 688         evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
 689         evergreen_cs_set_vertex_buffer(ctx, 1, 0,
 690                                 (struct pipe_resource*)pool->bo);
 691 }
 692
 693 /**
 694  * This function initializes all the compute specific registers that need to
 695  * be initialized for each compute command stream.  Registers that are common
 696  * to both compute and 3D will be initialized at the beginning of each compute
 697  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 698  * packet requires that the shader type bit be set, we must initialize all
 699  * context registers needed for compute in this function.  The registers
  700  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 701  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 702  * on the GPU family.
 703  */
 704 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
 705 {
 706         struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
 707         int num_threads;
 708         int num_stack_entries;
 709
 710         /* since all required registers are initialised in the
 711          * start_compute_cs_cmd atom, we can EMIT_EARLY here.
 712          */
 713         r600_init_command_buffer(cb, 256);
 714         cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
 715
 716         /* This must be first. */
 717         r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
 718         r600_store_value(cb, 0x80000000);
 719         r600_store_value(cb, 0x80000000);
 720
 721         /* We're setting config registers here. */
 722         r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 723         r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 724
 725         switch (ctx->b.family) {
 726         case CHIP_CEDAR:
 727         default:
 728                 num_threads = 128;
 729                 num_stack_entries = 256;
 730                 break;
 731         case CHIP_REDWOOD:
 732                 num_threads = 128;
 733                 num_stack_entries = 256;
 734                 break;
 735         case CHIP_JUNIPER:
 736                 num_threads = 128;
 737                 num_stack_entries = 512;
 738                 break;
 739         case CHIP_CYPRESS:
 740         case CHIP_HEMLOCK:
 741                 num_threads = 128;
 742                 num_stack_entries = 512;
 743                 break;
 744         case CHIP_PALM:
 745                 num_threads = 128;
 746                 num_stack_entries = 256;
 747                 break;
 748         case CHIP_SUMO:
 749                 num_threads = 128;
 750                 num_stack_entries = 256;
 751                 break;
 752         case CHIP_SUMO2:
 753                 num_threads = 128;
 754                 num_stack_entries = 512;
 755                 break;
 756         case CHIP_BARTS:
 757                 num_threads = 128;
 758                 num_stack_entries = 512;
 759                 break;
 760         case CHIP_TURKS:
 761                 num_threads = 128;
 762                 num_stack_entries = 256;
 763                 break;
 764         case CHIP_CAICOS:
 765                 num_threads = 128;
 766                 num_stack_entries = 256;
 767                 break;
 768         }
 769
 770         /* Config Registers */
 771         if (ctx->b.chip_class < CAYMAN)
 772                 evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
 773                                            ctx->screen->b.info.drm_minor);
 774         else
 775                 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
 776                                         ctx->screen->b.info.drm_minor);
 777
 778         /* The primitive type always needs to be POINTLIST for compute. */
 779         r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
 780                                                 V_008958_DI_PT_POINTLIST);
 781
 782         if (ctx->b.chip_class < CAYMAN) {
 783
 784                 /* These registers control which simds can be used by each stage.
 785                  * The default for these registers is 0xffffffff, which means
 786                  * all simds are available for each stage.  It's possible we may
 787                  * want to play around with these in the future, but for now
 788                  * the default value is fine.
 789                  *
 790                  * R_008E20_SQ_STATIC_THREAD_MGMT1
 791                  * R_008E24_SQ_STATIC_THREAD_MGMT2
 792                  * R_008E28_SQ_STATIC_THREAD_MGMT3
 793                  */
 794
 795                 /* XXX: We may need to adjust the thread and stack resouce
 796                  * values for 3D/compute interop */
 797
 798                 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
 799
 800                 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
 801                  * Set the number of threads used by the PS/VS/GS/ES stage to
 802                  * 0.
 803                  */
 804                 r600_store_value(cb, 0);
 805
 806                 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
 807                  * Set the number of threads used by the CS (aka LS) stage to
 808                  * the maximum number of threads and set the number of threads
 809                  * for the HS stage to 0. */
 810                 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
 811
 812                 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
 813                  * Set the Control Flow stack entries to 0 for PS/VS stages */
 814                 r600_store_value(cb, 0);
 815
 816                 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
 817                  * Set the Control Flow stack entries to 0 for GS/ES stages */
 818                 r600_store_value(cb, 0);
 819
 820                 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
 821                  * Set the Contol Flow stack entries to 0 for the HS stage, and
 822                  * set it to the maximum value for the CS (aka LS) stage. */
 823                 r600_store_value(cb,
 824                         S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
 825         }
 826         /* Give the compute shader all the available LDS space.
 827          * NOTE: This only sets the maximum number of dwords that a compute
 828          * shader can allocate.  When a shader is executed, we still need to
 829          * allocate the appropriate amount of LDS dwords using the
 830          * CM_R_0288E8_SQ_LDS_ALLOC register.
 831          */
 832         if (ctx->b.chip_class < CAYMAN) {
 833                 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
 834                         S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
 835         } else {
 836                 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
 837                         S_0286FC_NUM_PS_LDS(0) |
 838                         S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
 839         }
 840
 841         /* Context Registers */
 842
 843         if (ctx->b.chip_class < CAYMAN) {
 844                 /* workaround for hw issues with dyn gpr - must set all limits
 845                  * to 240 instead of 0, 0x1e == 240 / 8
 846                  */
 847                 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
 848                                 S_028838_PS_GPRS(0x1e) |
 849                                 S_028838_VS_GPRS(0x1e) |
 850                                 S_028838_GS_GPRS(0x1e) |
 851                                 S_028838_ES_GPRS(0x1e) |
 852                                 S_028838_HS_GPRS(0x1e) |
 853                                 S_028838_LS_GPRS(0x1e));
 854         }
 855
 856         /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
 857         r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
 858                 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
 859
 860         r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
 861
 862         r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
 863                                                 S_0286E8_TID_IN_GROUP_ENA
 864                                                 | S_0286E8_TGID_ENA
 865                                                 | S_0286E8_DISABLE_INDEX_PACK)
 866                                                 ;
 867
 868         /* The LOOP_CONST registers are an optimizations for loops that allows
 869          * you to store the initial counter, increment value, and maximum
 870          * counter value in a register so that hardware can calculate the
 871          * correct number of iterations for the loop, so that you don't need
 872          * to have the loop counter in your shader code.  We don't currently use
 873          * this optimization, so we must keep track of the counter in the
 874          * shader and use a break instruction to exit loops.  However, the
 875          * hardware will still uses this register to determine when to exit a
 876          * loop, so we need to initialize the counter to 0, set the increment
 877          * value to 1 and the maximum counter value to the 4095 (0xfff) which
 878          * is the maximum value allowed.  This gives us a maximum of 4096
 879          * iterations for our loops, but hopefully our break instruction will
 880          * execute before some time before the 4096th iteration.
 881          */
 882         eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
 883 }
 884
 885 void evergreen_init_compute_state_functions(struct r600_context *ctx)
 886 {
 887         ctx->b.b.create_compute_state = evergreen_create_compute_state;
 888         ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
 889         ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
 890 //       ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
 891         ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
 892         ctx->b.b.set_global_binding = evergreen_set_global_binding;
 893         ctx->b.b.launch_grid = evergreen_launch_grid;
 894
 895         /* We always use at least one vertex buffer for parameters (id = 1)*/
 896         ctx->cs_vertex_buffer_state.enabled_mask =
 897         ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
 898 }
 899
 900 struct pipe_resource *r600_compute_global_buffer_create(
 901         struct pipe_screen *screen,
 902         const struct pipe_resource *templ)
 903 {
 904         struct r600_resource_global* result = NULL;
 905         struct r600_screen* rscreen = NULL;
 906         int size_in_dw = 0;
 907
 908         assert(templ->target == PIPE_BUFFER);
 909         assert(templ->bind & PIPE_BIND_GLOBAL);
 910         assert(templ->array_size == 1 || templ->array_size == 0);
 911         assert(templ->depth0 == 1 || templ->depth0 == 0);
 912         assert(templ->height0 == 1 || templ->height0 == 0);
 913
 914         result = (struct r600_resource_global*)
 915         CALLOC(sizeof(struct r600_resource_global), 1);
 916         rscreen = (struct r600_screen*)screen;
 917
 918         COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
 919         COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
 920                         templ->array_size);
 921
 922         result->base.b.vtbl = &r600_global_buffer_vtbl;
 923         result->base.b.b.screen = screen;
 924         result->base.b.b = *templ;
 925         pipe_reference_init(&result->base.b.b.reference, 1);
 926
 927         size_in_dw = (templ->width0+3) / 4;
 928
 929         result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
 930
 931         if (result->chunk == NULL)
 932         {
 933                 free(result);
 934                 return NULL;
 935         }
 936
 937         return &result->base.b.b;
 938 }
 939
 940 void r600_compute_global_buffer_destroy(
 941         struct pipe_screen *screen,
 942         struct pipe_resource *res)
 943 {
 944         struct r600_resource_global* buffer = NULL;
 945         struct r600_screen* rscreen = NULL;
 946
 947         assert(res->target == PIPE_BUFFER);
 948         assert(res->bind & PIPE_BIND_GLOBAL);
 949
 950         buffer = (struct r600_resource_global*)res;
 951         rscreen = (struct r600_screen*)screen;
 952
 953         compute_memory_free(rscreen->global_pool, buffer->chunk->id);
 954
 955         buffer->chunk = NULL;
 956         free(res);
 957 }
 958
 959 void *r600_compute_global_transfer_map(
 960         struct pipe_context *ctx_,
 961         struct pipe_resource *resource,
 962         unsigned level,
 963         unsigned usage,
 964         const struct pipe_box *box,
 965         struct pipe_transfer **ptransfer)
 966 {
 967         struct r600_context *rctx = (struct r600_context*)ctx_;
 968         struct compute_memory_pool *pool = rctx->screen->global_pool;
 969         struct r600_resource_global* buffer =
 970                 (struct r600_resource_global*)resource;
 971
 972         struct compute_memory_item *item = buffer->chunk;
 973         struct pipe_resource *dst = NULL;
 974         unsigned offset = box->x;
 975
 976         if (is_item_in_pool(item)) {
 977                 compute_memory_demote_item(pool, item, ctx_);
 978         }
 979         else {
 980                 if (item->real_buffer == NULL) {
 981                         item->real_buffer = (struct r600_resource*)
 982                                         r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
 983                 }
 984         }
 985
 986         dst = (struct pipe_resource*)item->real_buffer;
 987
 988         if (usage & PIPE_TRANSFER_READ)
 989                 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
 990
 991         COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
 992                         "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
 993                         "width = %u, height = %u, depth = %u)\n", level, usage,
 994                         box->x, box->y, box->z, box->width, box->height,
 995                         box->depth);
 996         COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
 997                 "%u (box.x)\n", item->id, box->x);
 998
 999
1000         assert(resource->target == PIPE_BUFFER);
1001         assert(resource->bind & PIPE_BIND_GLOBAL);
1002         assert(box->x >= 0);
1003         assert(box->y == 0);
1004         assert(box->z == 0);
1005
1006         ///TODO: do it better, mapping is not possible if the pool is too big
1007         return pipe_buffer_map_range(ctx_, dst,
1008                         offset, box->width, usage, ptransfer);
1009 }
1010
/**
 * Placeholder unmap hook for global buffers; it must never run.
 *
 * A struct r600_resource_global is not a real resource.
 * r600_compute_global_transfer_map() maps a different, real buffer through
 * pipe_buffer_map_range(), which initializes ptransfer->resource with that
 * buffer rather than with the r600_resource_global it was given.  Unmapping
 * therefore goes through that buffer's vtable, not this function.
 * NOTE(review): the previous comment named r600_buffer_transfer_map() as the
 * function reached that way; presumably the unmap counterpart was meant --
 * confirm.
 */
void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	(void)ctx_;
	(void)transfer;

	assert(!"This function should not be called");
}
1027
/**
 * Flush-region hook for global buffers.  Not implemented: with assertions
 * enabled, calling it aborts; all arguments are ignored.
 */
void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	(void)ctx_;
	(void)transfer;
	(void)box;

	assert(0 && "TODO");
}
1035
/**
 * Inline-write hook for global buffers.  Not implemented: with assertions
 * enabled, calling it aborts; all arguments are ignored.
 */
void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	(void)pipe;
	(void)resource;
	(void)level;
	(void)usage;
	(void)box;
	(void)data;
	(void)stride;
	(void)layer_stride;

	assert(0 && "TODO");
}