src/gallium/drivers/r600/evergreen_compute.c

   1 /*
   2  * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Adam Rak <adam.rak@streamnovation.com>
  25  */
  26
  27 #include <stdio.h>
  28 #include <errno.h>
  29 #include "pipe/p_defines.h"
  30 #include "pipe/p_state.h"
  31 #include "pipe/p_context.h"
  32 #include "util/u_blitter.h"
  33 #include "util/u_double_list.h"
  34 #include "util/u_transfer.h"
  35 #include "util/u_surface.h"
  36 #include "util/u_pack_color.h"
  37 #include "util/u_memory.h"
  38 #include "util/u_inlines.h"
  39 #include "util/u_framebuffer.h"
  40 #include "pipebuffer/pb_buffer.h"
  41 #include "evergreend.h"
  42 #include "r600_shader.h"
  43 #include "r600_pipe.h"
  44 #include "r600_formats.h"
  45 #include "evergreen_compute.h"
  46 #include "evergreen_compute_internal.h"
  47 #include "compute_memory_pool.h"
  48 #include "sb/sb_public.h"
  49 #ifdef HAVE_OPENCL
  50 #include "radeon_llvm_util.h"
  51 #endif
  52
  53 /**
  54 RAT0 is for global binding write
  55 VTX1 is for global binding read
  56
   57 for writing images RAT1...
  58 for reading images TEX2...
  59   TEX2-RAT1 is paired
  60
  61 TEX2... consumes the same fetch resources, that VTX2... would consume
  62
  63 CONST0 and VTX0 is for parameters
  64   CONST0 is binding smaller input parameter buffer, and for constant indexing,
  65   also constant cached
  66   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  67   the constant cache can handle
  68
  69 RAT-s are limited to 12, so we can only bind at most 11 texture for writing
  70 because we reserve RAT0 for global bindings. With byteaddressing enabled,
  71 we should reserve another one too.=> 10 image binding for writing max.
  72
  73 from Nvidia OpenCL:
  74   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  75   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
  76
  77 so 10 for writing is enough. 176 is the max for reading according to the docs
  78
  79 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
  80 writable images will consume TEX slots, VTX slots too because of linear indexing
  81
  82 */
  83
  84 struct r600_resource* r600_compute_buffer_alloc_vram(
  85        struct r600_screen *screen,
  86        unsigned size)
  87 {
  88         struct pipe_resource * buffer = NULL;
  89         assert(size);
  90
  91         buffer = pipe_buffer_create(
  92                 (struct pipe_screen*) screen,
  93                 PIPE_BIND_CUSTOM,
  94                 PIPE_USAGE_IMMUTABLE,
  95                 size);
  96
  97         return (struct r600_resource *)buffer;
  98 }
  99
 100
 101 static void evergreen_set_rat(
 102         struct r600_pipe_compute *pipe,
 103         int id,
 104         struct r600_resource* bo,
 105         int start,
 106         int size)
 107 {
 108         struct pipe_surface rat_templ;
 109         struct r600_surface *surf = NULL;
 110         struct r600_context *rctx = NULL;
 111
 112         assert(id < 12);
 113         assert((size & 3) == 0);
 114         assert((start & 0xFF) == 0);
 115
 116         rctx = pipe->ctx;
 117
 118         COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
 119
 120         /* Create the RAT surface */
 121         memset(&rat_templ, 0, sizeof(rat_templ));
 122         rat_templ.format = PIPE_FORMAT_R32_UINT;
 123         rat_templ.u.tex.level = 0;
 124         rat_templ.u.tex.first_layer = 0;
 125         rat_templ.u.tex.last_layer = 0;
 126
 127         /* Add the RAT the list of color buffers */
 128         pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
 129                 (struct pipe_context *)pipe->ctx,
 130                 (struct pipe_resource *)bo, &rat_templ);
 131
 132         /* Update the number of color buffers */
 133         pipe->ctx->framebuffer.state.nr_cbufs =
 134                 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
 135
 136         /* Update the cb_target_mask
 137          * XXX: I think this is a potential spot for bugs once we start doing
 138          * GL interop.  cb_target_mask may be modified in the 3D sections
 139          * of this driver. */
 140         pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
 141
 142         surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
 143         evergreen_init_color_surface_rat(rctx, surf);
 144 }
 145
 146 static void evergreen_cs_set_vertex_buffer(
 147         struct r600_context * rctx,
 148         unsigned vb_index,
 149         unsigned offset,
 150         struct pipe_resource * buffer)
 151 {
 152         struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
 153         struct pipe_vertex_buffer *vb = &state->vb[vb_index];
 154         vb->stride = 1;
 155         vb->buffer_offset = offset;
 156         vb->buffer = buffer;
 157         vb->user_buffer = NULL;
 158
 159         /* The vertex instructions in the compute shaders use the texture cache,
 160          * so we need to invalidate it. */
 161         rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 162         state->enabled_mask |= 1 << vb_index;
 163         state->dirty_mask |= 1 << vb_index;
 164         state->atom.dirty = true;
 165 }
 166
 167 static void evergreen_cs_set_constant_buffer(
 168         struct r600_context * rctx,
 169         unsigned cb_index,
 170         unsigned offset,
 171         unsigned size,
 172         struct pipe_resource * buffer)
 173 {
 174         struct pipe_constant_buffer cb;
 175         cb.buffer_size = size;
 176         cb.buffer_offset = offset;
 177         cb.buffer = buffer;
 178         cb.user_buffer = NULL;
 179
 180         rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
 181 }
 182
 183 static const struct u_resource_vtbl r600_global_buffer_vtbl =
 184 {
 185         u_default_resource_get_handle, /* get_handle */
 186         r600_compute_global_buffer_destroy, /* resource_destroy */
 187         r600_compute_global_transfer_map, /* transfer_map */
 188         r600_compute_global_transfer_flush_region,/* transfer_flush_region */
 189         r600_compute_global_transfer_unmap, /* transfer_unmap */
 190         r600_compute_global_transfer_inline_write /* transfer_inline_write */
 191 };
 192
 193
 194 void *evergreen_create_compute_state(
 195         struct pipe_context *ctx_,
 196         const const struct pipe_compute_state *cso)
 197 {
 198         struct r600_context *ctx = (struct r600_context *)ctx_;
 199         struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
 200
 201 #ifdef HAVE_OPENCL
 202         const struct pipe_llvm_program_header * header;
 203         const unsigned char * code;
 204         unsigned i;
 205
 206         shader->llvm_ctx = LLVMContextCreate();
 207
 208         COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
 209
 210         header = cso->prog;
 211         code = cso->prog + sizeof(struct pipe_llvm_program_header);
 212 #endif
 213
 214         shader->ctx = (struct r600_context*)ctx;
 215         shader->local_size = cso->req_local_mem;
 216         shader->private_size = cso->req_private_mem;
 217         shader->input_size = cso->req_input_mem;
 218
 219 #ifdef HAVE_OPENCL
 220         shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx, code,
 221                                                         header->num_bytes);
 222         shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
 223
 224         for (i = 0; i < shader->num_kernels; i++) {
 225                 struct r600_kernel *kernel = &shader->kernels[i];
 226                 kernel->llvm_module = radeon_llvm_get_kernel_module(shader->llvm_ctx, i,
 227                                                         code, header->num_bytes);
 228         }
 229 #endif
 230         return shader;
 231 }
 232
 233 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
 234 {
 235         struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
 236
 237         if (!shader)
 238                 return;
 239
 240         FREE(shader->kernels);
 241
 242 #ifdef HAVE_OPENCL
 243         if (shader->llvm_ctx){
 244                 LLVMContextDispose(shader->llvm_ctx);
 245         }
 246 #endif
 247
 248         FREE(shader);
 249 }
 250
 251 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
 252 {
 253         struct r600_context *ctx = (struct r600_context *)ctx_;
 254
 255         COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
 256
 257         ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
 258 }
 259
 260 /* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
 261  * kernel parameters there are implicit parameters that need to be stored
 262  * in the vertex buffer as well.  Here is how these parameters are organized in
 263  * the buffer:
 264  *
 265  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 266  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 267  * DWORDS 6-8: Number of work items within each work group in each dimension
 268  *             (x,y,z)
 269  * DWORDS 9+ : Kernel parameters
 270  */
 271 void evergreen_compute_upload_input(
 272         struct pipe_context *ctx_,
 273         const uint *block_layout,
 274         const uint *grid_layout,
 275         const void *input)
 276 {
 277         struct r600_context *ctx = (struct r600_context *)ctx_;
 278         struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
 279         int i;
 280         /* We need to reserve 9 dwords (36 bytes) for implicit kernel
 281          * parameters.
 282          */
 283         unsigned input_size = shader->input_size + 36;
 284         uint32_t * num_work_groups_start;
 285         uint32_t * global_size_start;
 286         uint32_t * local_size_start;
 287         uint32_t * kernel_parameters_start;
 288         struct pipe_box box;
 289         struct pipe_transfer *transfer = NULL;
 290
 291         if (shader->input_size == 0) {
 292                 return;
 293         }
 294
 295         if (!shader->kernel_param) {
 296                 /* Add space for the grid dimensions */
 297                 shader->kernel_param = (struct r600_resource *)
 298                         pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
 299                                         PIPE_USAGE_IMMUTABLE, input_size);
 300         }
 301
 302         u_box_1d(0, input_size, &box);
 303         num_work_groups_start = ctx_->transfer_map(ctx_,
 304                         (struct pipe_resource*)shader->kernel_param,
 305                         0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
 306                         &box, &transfer);
 307         global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
 308         local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
 309         kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
 310
 311         /* Copy the work group size */
 312         memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
 313
 314         /* Copy the global size */
 315         for (i = 0; i < 3; i++) {
 316                 global_size_start[i] = grid_layout[i] * block_layout[i];
 317         }
 318
 319         /* Copy the local dimensions */
 320         memcpy(local_size_start, block_layout, 3 * sizeof(uint));
 321
 322         /* Copy the kernel inputs */
 323         memcpy(kernel_parameters_start, input, shader->input_size);
 324
 325         for (i = 0; i < (input_size / 4); i++) {
 326                 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
 327                         ((unsigned*)num_work_groups_start)[i]);
 328         }
 329
 330         ctx_->transfer_unmap(ctx_, transfer);
 331
 332         /* ID=0 is reserved for the parameters */
 333         evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
 334                         (struct pipe_resource*)shader->kernel_param);
 335 }
 336
 337 static void evergreen_emit_direct_dispatch(
 338                 struct r600_context *rctx,
 339                 const uint *block_layout, const uint *grid_layout)
 340 {
 341         int i;
 342         struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 343         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 344         unsigned num_waves;
 345         unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
 346         unsigned wave_divisor = (16 * num_pipes);
 347         int group_size = 1;
 348         int grid_size = 1;
 349         unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;
 350
 351         /* Calculate group_size/grid_size */
 352         for (i = 0; i < 3; i++) {
 353                 group_size *= block_layout[i];
 354         }
 355
 356         for (i = 0; i < 3; i++) {
 357                 grid_size *= grid_layout[i];
 358         }
 359
 360         /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
 361         num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
 362                         wave_divisor - 1) / wave_divisor;
 363
 364         COMPUTE_DBG(rctx->screen, "Using %u pipes, "
 365                                 "%u wavefronts per thread block, "
 366                                 "allocating %u dwords lds.\n",
 367                                 num_pipes, num_waves, lds_size);
 368
 369         r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
 370
 371         r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
 372         radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
 373         radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
 374         radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
 375
 376         r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
 377                                                                 group_size);
 378
 379         r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
 380         radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
 381         radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
 382         radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
 383
 384         if (rctx->b.chip_class < CAYMAN) {
 385                 assert(lds_size <= 8192);
 386         } else {
 387                 /* Cayman appears to have a slightly smaller limit, see the
 388                  * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
 389                 assert(lds_size <= 8160);
 390         }
 391
 392         r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
 393                                         lds_size | (num_waves << 14));
 394
 395         /* Dispatch packet */
 396         radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
 397         radeon_emit(cs, grid_layout[0]);
 398         radeon_emit(cs, grid_layout[1]);
 399         radeon_emit(cs, grid_layout[2]);
 400         /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
 401         radeon_emit(cs, 1);
 402 }
 403
 404 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 405                 const uint *grid_layout)
 406 {
 407         struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
 408         int i;
 409
 410         /* make sure that the gfx ring is only one active */
 411         if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
 412                 ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 413         }
 414
 415         /* Initialize all the compute-related registers.
 416          *
 417          * See evergreen_init_atom_start_compute_cs() in this file for the list
 418          * of registers initialized by the start_compute_cs_cmd atom.
 419          */
 420         r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
 421
 422         ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
 423         r600_flush_emit(ctx);
 424
 425         /* Emit colorbuffers. */
 426         /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
 427         for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
 428                 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
 429                 unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
 430                                                        (struct r600_resource*)cb->base.texture,
 431                                                        RADEON_USAGE_READWRITE,
 432                                                        RADEON_PRIO_SHADER_RESOURCE_RW);
 433
 434                 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
 435                 radeon_emit(cs, cb->cb_color_base);     /* R_028C60_CB_COLOR0_BASE */
 436                 radeon_emit(cs, cb->cb_color_pitch);    /* R_028C64_CB_COLOR0_PITCH */
 437                 radeon_emit(cs, cb->cb_color_slice);    /* R_028C68_CB_COLOR0_SLICE */
 438                 radeon_emit(cs, cb->cb_color_view);     /* R_028C6C_CB_COLOR0_VIEW */
 439                 radeon_emit(cs, cb->cb_color_info);     /* R_028C70_CB_COLOR0_INFO */
 440                 radeon_emit(cs, cb->cb_color_attrib);   /* R_028C74_CB_COLOR0_ATTRIB */
 441                 radeon_emit(cs, cb->cb_color_dim);              /* R_028C78_CB_COLOR0_DIM */
 442
 443                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
 444                 radeon_emit(cs, reloc);
 445
 446                 if (!ctx->keep_tiling_flags) {
 447                         radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
 448                         radeon_emit(cs, reloc);
 449                 }
 450
 451                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
 452                 radeon_emit(cs, reloc);
 453         }
 454         if (ctx->keep_tiling_flags) {
 455                 for (; i < 8 ; i++) {
 456                         r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
 457                                                        S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 458                 }
 459                 for (; i < 12; i++) {
 460                         r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
 461                                                        S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 462                 }
 463         }
 464
 465         /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
 466         r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
 467                                         ctx->compute_cb_target_mask);
 468
 469
 470         /* Emit vertex buffer state */
 471         ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
 472         r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
 473
 474         /* Emit constant buffer state */
 475         r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
 476
 477         /* Emit compute shader state */
 478         r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
 479
 480         /* Emit dispatch state and dispatch packet */
 481         evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
 482
 483         /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
 484          */
 485         ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
 486                       R600_CONTEXT_INV_VERTEX_CACHE |
 487                       R600_CONTEXT_INV_TEX_CACHE;
 488         r600_flush_emit(ctx);
 489         ctx->b.flags = 0;
 490
 491         if (ctx->b.chip_class >= CAYMAN) {
 492                 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
 493                 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
 494                 /* DEALLOC_STATE prevents the GPU from hanging when a
 495                  * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
 496                  * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
 497                  */
 498                 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
 499                 cs->buf[cs->cdw++] = 0;
 500         }
 501
 502 #if 0
 503         COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
 504         for (i = 0; i < cs->cdw; i++) {
 505                 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
 506         }
 507 #endif
 508
 509 }
 510
 511
 512 /**
 513  * Emit function for r600_cs_shader_state atom
 514  */
 515 void evergreen_emit_cs_shader(
 516                 struct r600_context *rctx,
 517                 struct r600_atom *atom)
 518 {
 519         struct r600_cs_shader_state *state =
 520                                         (struct r600_cs_shader_state*)atom;
 521         struct r600_pipe_compute *shader = state->shader;
 522         struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
 523         struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 524         uint64_t va;
 525
 526         va = r600_resource_va(&rctx->screen->b.b, &kernel->code_bo->b.b);
 527
 528         r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
 529         radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
 530         radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
 531                         S_0288D4_NUM_GPRS(kernel->bc.ngpr)
 532                         | S_0288D4_STACK_SIZE(kernel->bc.nstack));
 533         radeon_emit(cs, 0);     /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
 534
 535         radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
 536         radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
 537                                               kernel->code_bo, RADEON_USAGE_READ,
 538                                               RADEON_PRIO_SHADER_DATA));
 539 }
 540
 541 static void evergreen_launch_grid(
 542                 struct pipe_context *ctx_,
 543                 const uint *block_layout, const uint *grid_layout,
 544                 uint32_t pc, const void *input)
 545 {
 546         struct r600_context *ctx = (struct r600_context *)ctx_;
 547
 548         struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
 549         struct r600_kernel *kernel = &shader->kernels[pc];
 550
 551         COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
 552
 553 #ifdef HAVE_OPENCL
 554
 555         if (!kernel->code_bo) {
 556                 void *p;
 557                 struct r600_bytecode *bc = &kernel->bc;
 558                 LLVMModuleRef mod = kernel->llvm_module;
 559                 boolean use_kill = false;
 560                 bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
 561                 unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
 562                 unsigned sb_disasm = use_sb ||
 563                         (ctx->screen->b.debug_flags & DBG_SB_DISASM);
 564
 565                 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
 566                            ctx->screen->has_compressed_msaa_texturing);
 567                 bc->type = TGSI_PROCESSOR_COMPUTE;
 568                 bc->isa = ctx->isa;
 569                 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
 570
 571                 if (dump && !sb_disasm) {
 572                         r600_bytecode_disasm(bc);
 573                 } else if ((dump && sb_disasm) || use_sb) {
 574                         if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
 575                                 R600_ERR("r600_sb_bytecode_process failed!\n");
 576                 }
 577
 578                 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
 579                                                         kernel->bc.ndw * 4);
 580                 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
 581                 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
 582                 ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
 583         }
 584 #endif
 585         shader->active_kernel = kernel;
 586         ctx->cs_shader_state.kernel_index = pc;
 587         evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
 588         compute_emit_cs(ctx, block_layout, grid_layout);
 589 }
 590
 591 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
 592                 unsigned start, unsigned count,
 593                 struct pipe_surface ** surfaces)
 594 {
 595         struct r600_context *ctx = (struct r600_context *)ctx_;
 596         struct r600_surface **resources = (struct r600_surface **)surfaces;
 597
 598         COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
 599                         start, count);
 600
 601         for (int i = 0; i < count; i++) {
 602                 /* The First two vertex buffers are reserved for parameters and
 603                  * global buffers. */
 604                 unsigned vtx_id = 2 + i;
 605                 if (resources[i]) {
 606                         struct r600_resource_global *buffer =
 607                                 (struct r600_resource_global*)
 608                                 resources[i]->base.texture;
 609                         if (resources[i]->base.writable) {
 610                                 assert(i+1 < 12);
 611
 612                                 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
 613                                 (struct r600_resource *)resources[i]->base.texture,
 614                                 buffer->chunk->start_in_dw*4,
 615                                 resources[i]->base.texture->width0);
 616                         }
 617
 618                         evergreen_cs_set_vertex_buffer(ctx, vtx_id,
 619                                         buffer->chunk->start_in_dw * 4,
 620                                         resources[i]->base.texture);
 621                 }
 622         }
 623 }
 624
 625 void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
 626                 unsigned start_slot, unsigned count,
 627                 struct pipe_sampler_view **views)
 628 {
 629         struct r600_pipe_sampler_view **resource =
 630                 (struct r600_pipe_sampler_view **)views;
 631
 632         for (int i = 0; i < count; i++) {
 633                 if (resource[i]) {
 634                         assert(i+1 < 12);
 635                         /* XXX: Implement */
 636                         assert(!"Compute samplers not implemented.");
 637                         ///FETCH0 = VTX0 (param buffer),
 638                         //FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
 639                 }
 640         }
 641 }
 642
 643
 644 static void evergreen_set_global_binding(
 645         struct pipe_context *ctx_, unsigned first, unsigned n,
 646         struct pipe_resource **resources,
 647         uint32_t **handles)
 648 {
 649         struct r600_context *ctx = (struct r600_context *)ctx_;
 650         struct compute_memory_pool *pool = ctx->screen->global_pool;
 651         struct r600_resource_global **buffers =
 652                 (struct r600_resource_global **)resources;
 653
 654         COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
 655                         first, n);
 656
 657         if (!resources) {
 658                 /* XXX: Unset */
 659                 return;
 660         }
 661
 662         /* We mark these items for promotion to the pool if they
 663          * aren't already there */
 664         for (int i = 0; i < n; i++) {
 665                 struct compute_memory_item *item = buffers[i]->chunk;
 666
 667                 if (!is_item_in_pool(item))
 668                         buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
 669         }
 670
 671         if (compute_memory_finalize_pending(pool, ctx_) == -1) {
 672                 /* XXX: Unset */
 673                 return;
 674         }
 675
 676         for (int i = 0; i < n; i++)
 677         {
 678                 uint32_t buffer_offset;
 679                 uint32_t handle;
 680                 assert(resources[i]->target == PIPE_BUFFER);
 681                 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
 682
 683                 buffer_offset = util_le32_to_cpu(*(handles[i]));
 684                 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
 685
 686                 *(handles[i]) = util_cpu_to_le32(handle);
 687         }
 688
 689         evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
 690         evergreen_cs_set_vertex_buffer(ctx, 1, 0,
 691                                 (struct pipe_resource*)pool->bo);
 692 }
 693
 694 /**
 695  * This function initializes all the compute specific registers that need to
 696  * be initialized for each compute command stream.  Registers that are common
 697  * to both compute and 3D will be initialized at the beginning of each compute
 698  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 699  * packet requires that the shader type bit be set, we must initialize all
 700  * context registers needed for compute in this function.  The registers
  701  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 702  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 703  * on the GPU family.
 704  */
 705 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
 706 {
 707         struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
 708         int num_threads;
 709         int num_stack_entries;
 710
 711         /* since all required registers are initialised in the
 712          * start_compute_cs_cmd atom, we can EMIT_EARLY here.
 713          */
 714         r600_init_command_buffer(cb, 256);
 715         cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
 716
 717         /* This must be first. */
 718         r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
 719         r600_store_value(cb, 0x80000000);
 720         r600_store_value(cb, 0x80000000);
 721
 722         /* We're setting config registers here. */
 723         r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 724         r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 725
 726         switch (ctx->b.family) {
 727         case CHIP_CEDAR:
 728         default:
 729                 num_threads = 128;
 730                 num_stack_entries = 256;
 731                 break;
 732         case CHIP_REDWOOD:
 733                 num_threads = 128;
 734                 num_stack_entries = 256;
 735                 break;
 736         case CHIP_JUNIPER:
 737                 num_threads = 128;
 738                 num_stack_entries = 512;
 739                 break;
 740         case CHIP_CYPRESS:
 741         case CHIP_HEMLOCK:
 742                 num_threads = 128;
 743                 num_stack_entries = 512;
 744                 break;
 745         case CHIP_PALM:
 746                 num_threads = 128;
 747                 num_stack_entries = 256;
 748                 break;
 749         case CHIP_SUMO:
 750                 num_threads = 128;
 751                 num_stack_entries = 256;
 752                 break;
 753         case CHIP_SUMO2:
 754                 num_threads = 128;
 755                 num_stack_entries = 512;
 756                 break;
 757         case CHIP_BARTS:
 758                 num_threads = 128;
 759                 num_stack_entries = 512;
 760                 break;
 761         case CHIP_TURKS:
 762                 num_threads = 128;
 763                 num_stack_entries = 256;
 764                 break;
 765         case CHIP_CAICOS:
 766                 num_threads = 128;
 767                 num_stack_entries = 256;
 768                 break;
 769         }
 770
 771         /* Config Registers */
 772         if (ctx->b.chip_class < CAYMAN)
 773                 evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
 774                                            ctx->screen->b.info.drm_minor);
 775         else
 776                 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
 777                                         ctx->screen->b.info.drm_minor);
 778
 779         /* The primitive type always needs to be POINTLIST for compute. */
 780         r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
 781                                                 V_008958_DI_PT_POINTLIST);
 782
 783         if (ctx->b.chip_class < CAYMAN) {
 784
 785                 /* These registers control which simds can be used by each stage.
 786                  * The default for these registers is 0xffffffff, which means
 787                  * all simds are available for each stage.  It's possible we may
 788                  * want to play around with these in the future, but for now
 789                  * the default value is fine.
 790                  *
 791                  * R_008E20_SQ_STATIC_THREAD_MGMT1
 792                  * R_008E24_SQ_STATIC_THREAD_MGMT2
 793                  * R_008E28_SQ_STATIC_THREAD_MGMT3
 794                  */
 795
 796                 /* XXX: We may need to adjust the thread and stack resouce
 797                  * values for 3D/compute interop */
 798
 799                 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
 800
 801                 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
 802                  * Set the number of threads used by the PS/VS/GS/ES stage to
 803                  * 0.
 804                  */
 805                 r600_store_value(cb, 0);
 806
 807                 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
 808                  * Set the number of threads used by the CS (aka LS) stage to
 809                  * the maximum number of threads and set the number of threads
 810                  * for the HS stage to 0. */
 811                 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
 812
 813                 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
 814                  * Set the Control Flow stack entries to 0 for PS/VS stages */
 815                 r600_store_value(cb, 0);
 816
 817                 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
 818                  * Set the Control Flow stack entries to 0 for GS/ES stages */
 819                 r600_store_value(cb, 0);
 820
 821                 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
 822                  * Set the Contol Flow stack entries to 0 for the HS stage, and
 823                  * set it to the maximum value for the CS (aka LS) stage. */
 824                 r600_store_value(cb,
 825                         S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
 826         }
 827         /* Give the compute shader all the available LDS space.
 828          * NOTE: This only sets the maximum number of dwords that a compute
 829          * shader can allocate.  When a shader is executed, we still need to
 830          * allocate the appropriate amount of LDS dwords using the
 831          * CM_R_0288E8_SQ_LDS_ALLOC register.
 832          */
 833         if (ctx->b.chip_class < CAYMAN) {
 834                 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
 835                         S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
 836         } else {
 837                 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
 838                         S_0286FC_NUM_PS_LDS(0) |
 839                         S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
 840         }
 841
 842         /* Context Registers */
 843
 844         if (ctx->b.chip_class < CAYMAN) {
 845                 /* workaround for hw issues with dyn gpr - must set all limits
 846                  * to 240 instead of 0, 0x1e == 240 / 8
 847                  */
 848                 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
 849                                 S_028838_PS_GPRS(0x1e) |
 850                                 S_028838_VS_GPRS(0x1e) |
 851                                 S_028838_GS_GPRS(0x1e) |
 852                                 S_028838_ES_GPRS(0x1e) |
 853                                 S_028838_HS_GPRS(0x1e) |
 854                                 S_028838_LS_GPRS(0x1e));
 855         }
 856
 857         /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
 858         r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
 859                 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
 860
 861         r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
 862
 863         r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
 864                                                 S_0286E8_TID_IN_GROUP_ENA
 865                                                 | S_0286E8_TGID_ENA
 866                                                 | S_0286E8_DISABLE_INDEX_PACK)
 867                                                 ;
 868
 869         /* The LOOP_CONST registers are an optimizations for loops that allows
 870          * you to store the initial counter, increment value, and maximum
 871          * counter value in a register so that hardware can calculate the
 872          * correct number of iterations for the loop, so that you don't need
 873          * to have the loop counter in your shader code.  We don't currently use
 874          * this optimization, so we must keep track of the counter in the
 875          * shader and use a break instruction to exit loops.  However, the
 876          * hardware will still uses this register to determine when to exit a
 877          * loop, so we need to initialize the counter to 0, set the increment
 878          * value to 1 and the maximum counter value to the 4095 (0xfff) which
 879          * is the maximum value allowed.  This gives us a maximum of 4096
 880          * iterations for our loops, but hopefully our break instruction will
 881          * execute before some time before the 4096th iteration.
 882          */
 883         eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
 884 }
 885
 886 void evergreen_init_compute_state_functions(struct r600_context *ctx)
 887 {
 888         ctx->b.b.create_compute_state = evergreen_create_compute_state;
 889         ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
 890         ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
 891 //       ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
 892         ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
 893         ctx->b.b.set_global_binding = evergreen_set_global_binding;
 894         ctx->b.b.launch_grid = evergreen_launch_grid;
 895
 896         /* We always use at least one vertex buffer for parameters (id = 1)*/
 897         ctx->cs_vertex_buffer_state.enabled_mask =
 898         ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
 899 }
 900
 901 struct pipe_resource *r600_compute_global_buffer_create(
 902         struct pipe_screen *screen,
 903         const struct pipe_resource *templ)
 904 {
 905         struct r600_resource_global* result = NULL;
 906         struct r600_screen* rscreen = NULL;
 907         int size_in_dw = 0;
 908
 909         assert(templ->target == PIPE_BUFFER);
 910         assert(templ->bind & PIPE_BIND_GLOBAL);
 911         assert(templ->array_size == 1 || templ->array_size == 0);
 912         assert(templ->depth0 == 1 || templ->depth0 == 0);
 913         assert(templ->height0 == 1 || templ->height0 == 0);
 914
 915         result = (struct r600_resource_global*)
 916         CALLOC(sizeof(struct r600_resource_global), 1);
 917         rscreen = (struct r600_screen*)screen;
 918
 919         COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
 920         COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
 921                         templ->array_size);
 922
 923         result->base.b.vtbl = &r600_global_buffer_vtbl;
 924         result->base.b.b.screen = screen;
 925         result->base.b.b = *templ;
 926         pipe_reference_init(&result->base.b.b.reference, 1);
 927
 928         size_in_dw = (templ->width0+3) / 4;
 929
 930         result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
 931
 932         if (result->chunk == NULL)
 933         {
 934                 free(result);
 935                 return NULL;
 936         }
 937
 938         return &result->base.b.b;
 939 }
 940
 941 void r600_compute_global_buffer_destroy(
 942         struct pipe_screen *screen,
 943         struct pipe_resource *res)
 944 {
 945         struct r600_resource_global* buffer = NULL;
 946         struct r600_screen* rscreen = NULL;
 947
 948         assert(res->target == PIPE_BUFFER);
 949         assert(res->bind & PIPE_BIND_GLOBAL);
 950
 951         buffer = (struct r600_resource_global*)res;
 952         rscreen = (struct r600_screen*)screen;
 953
 954         compute_memory_free(rscreen->global_pool, buffer->chunk->id);
 955
 956         buffer->chunk = NULL;
 957         free(res);
 958 }
 959
 960 void *r600_compute_global_transfer_map(
 961         struct pipe_context *ctx_,
 962         struct pipe_resource *resource,
 963         unsigned level,
 964         unsigned usage,
 965         const struct pipe_box *box,
 966         struct pipe_transfer **ptransfer)
 967 {
 968         struct r600_context *rctx = (struct r600_context*)ctx_;
 969         struct compute_memory_pool *pool = rctx->screen->global_pool;
 970         struct r600_resource_global* buffer =
 971                 (struct r600_resource_global*)resource;
 972
 973         struct pipe_resource *dst;
 974         unsigned offset = box->x;
 975
 976         if (is_item_in_pool(buffer->chunk)) {
 977                 compute_memory_demote_item(pool, buffer->chunk, ctx_);
 978         }
 979
 980         dst = (struct pipe_resource*)buffer->chunk->real_buffer;
 981
 982         if (usage & PIPE_TRANSFER_READ)
 983                 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
 984
 985         COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
 986                         "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
 987                         "width = %u, height = %u, depth = %u)\n", level, usage,
 988                         box->x, box->y, box->z, box->width, box->height,
 989                         box->depth);
 990         COMPUTE_DBG(rctx->screen, "Buffer id = %u offset = "
 991                 "%u (box.x)\n", buffer->chunk->id, box->x);
 992
 993
 994         assert(resource->target == PIPE_BUFFER);
 995         assert(resource->bind & PIPE_BIND_GLOBAL);
 996         assert(box->x >= 0);
 997         assert(box->y == 0);
 998         assert(box->z == 0);
 999
1000         ///TODO: do it better, mapping is not possible if the pool is too big
1001         return pipe_buffer_map_range(ctx_, dst,
1002                         offset, box->width, usage, ptransfer);
1003 }
1004
/**
 * Unmap hook for global buffers; never expected to run.
 *
 * A struct r600_resource_global is not a real resource, it only maps to
 * an offset within the compute memory pool.
 * r600_compute_global_transfer_map() maps the memory pool resource instead
 * of the struct r600_resource_global it is given, and the transfer's
 * resource is initialized with that pool resource (via
 * pipe_buffer_map_range).  Unmapping therefore goes through the memory
 * pool's vtable rather than through this function.
 *
 * \param ctx_      unused
 * \param transfer  unused
 */
void r600_compute_global_transfer_unmap(
        struct pipe_context *ctx_,
        struct pipe_transfer* transfer)
{
        (void)ctx_;
        (void)transfer;

        assert (!"This function should not be called");
}
1021
/**
 * Flush-region hook for global buffers.  Not implemented: it asserts
 * unconditionally.
 *
 * \param ctx_      unused
 * \param transfer  unused
 * \param box       unused
 */
void r600_compute_global_transfer_flush_region(
        struct pipe_context *ctx_,
        struct pipe_transfer *transfer,
        const struct pipe_box *box)
{
        (void)ctx_;
        (void)transfer;
        (void)box;

        assert(0 && "TODO");
}
1029
/**
 * Inline-write hook for global buffers.  Not implemented: it asserts
 * unconditionally and none of its arguments are used.
 */
void r600_compute_global_transfer_inline_write(
        struct pipe_context *pipe,
        struct pipe_resource *resource,
        unsigned level,
        unsigned usage,
        const struct pipe_box *box,
        const void *data,
        unsigned stride,
        unsigned layer_stride)
{
        (void)pipe;
        (void)resource;
        (void)level;
        (void)usage;
        (void)box;
        (void)data;
        (void)stride;
        (void)layer_stride;

        assert(0 && "TODO");
}