r600g: split INVAL_READ_CACHES into vertex, tex, and const cache flags
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/u_double_list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_resource.h"
43 #include "r600_shader.h"
44 #include "r600_pipe.h"
45 #include "r600_formats.h"
46 #include "evergreen_compute.h"
47 #include "evergreen_compute_internal.h"
48 #include "compute_memory_pool.h"
49 #include "sb/sb_public.h"
50 #ifdef HAVE_OPENCL
51 #include "radeon_llvm_util.h"
52 #endif
53
54 /**
55 RAT0 is for global binding write
56 VTX1 is for global binding read
57
58 for writing images: RAT1...
59 for reading images: TEX2...
60 TEX2-RAT1 is paired
61
62 TEX2... consumes the same fetch resources that VTX2... would consume
63
64 CONST0 and VTX0 are for parameters
65 CONST0 binds the smaller input parameter buffer and is used for constant
66 indexing; it is also constant cached
67 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
68 the constant cache can handle
69
70 RATs are limited to 12, so we can bind at most 11 textures for writing,
71 because we reserve RAT0 for global bindings. With byte addressing enabled
72 we should reserve another one too => at most 10 image bindings for writing.
73
74 from Nvidia OpenCL:
75 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
76 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
77
78 so 10 for writing is enough. 176 is the max for reading according to the docs
79
80 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
81 writable images also consume TEX slots, and VTX slots too, because of linear indexing
82
83 */
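/* Illustrative example of the scheme above (editor's sketch, not part of the
 * original comment): a hypothetical kernel with a small parameter buffer, one
 * writable image and two read-only images would be bound roughly as:
 *
 *   parameters          -> CONST0 (VTX0 for indirect/large accesses)
 *   global buffer pool  -> RAT0 for writes, VTX1 for reads
 *   writable image 0    -> RAT1, paired with TEX2 (plus a VTX slot)
 *   read-only image 0   -> TEX3
 *   read-only image 1   -> TEX4
 */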
84
85 struct r600_resource* r600_compute_buffer_alloc_vram(
86 struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource * buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create(
93 (struct pipe_screen*) screen,
94 PIPE_BIND_CUSTOM,
95 PIPE_USAGE_IMMUTABLE,
96 size);
97
98 return (struct r600_resource *)buffer;
99 }
100
101
102 static void evergreen_set_rat(
103 struct r600_pipe_compute *pipe,
104 int id,
105 struct r600_resource* bo,
106 int start,
107 int size)
108 {
109 struct pipe_surface rat_templ;
110 struct r600_surface *surf = NULL;
111 struct r600_context *rctx = NULL;
112
113 assert(id < 12);
114 assert((size & 3) == 0);
115 assert((start & 0xFF) == 0);
116
117 rctx = pipe->ctx;
118
119 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
120
121 /* Create the RAT surface */
122 memset(&rat_templ, 0, sizeof(rat_templ));
123 rat_templ.format = PIPE_FORMAT_R32_UINT;
124 rat_templ.u.tex.level = 0;
125 rat_templ.u.tex.first_layer = 0;
126 rat_templ.u.tex.last_layer = 0;
127
128 /* Add the RAT to the list of color buffers */
129 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->context.create_surface(
130 (struct pipe_context *)pipe->ctx,
131 (struct pipe_resource *)bo, &rat_templ);
132
133 /* Update the number of color buffers */
134 pipe->ctx->framebuffer.state.nr_cbufs =
135 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
136
137 /* Update the cb_target_mask
138 * XXX: I think this is a potential spot for bugs once we start doing
139 * GL interop. cb_target_mask may be modified in the 3D sections
140 * of this driver. */
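/* For example, binding RAT1 (id = 1) ORs in 0xf0, i.e. it enables all four
 * channels of color buffer 1 in the target mask. */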
141 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
142
143 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
144 evergreen_init_color_surface_rat(rctx, surf);
145 }
146
147 static void evergreen_cs_set_vertex_buffer(
148 struct r600_context * rctx,
149 unsigned vb_index,
150 unsigned offset,
151 struct pipe_resource * buffer)
152 {
153 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
154 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
155 vb->stride = 1;
156 vb->buffer_offset = offset;
157 vb->buffer = buffer;
158 vb->user_buffer = NULL;
159
160 /* The vertex instructions in the compute shaders use the texture cache,
161 * so we need to invalidate it. */
162 rctx->flags |= R600_CONTEXT_INV_VERTEX_CACHE;
163 state->enabled_mask |= 1 << vb_index;
164 state->dirty_mask |= 1 << vb_index;
165 state->atom.dirty = true;
166 }
167
168 static void evergreen_cs_set_constant_buffer(
169 struct r600_context * rctx,
170 unsigned cb_index,
171 unsigned offset,
172 unsigned size,
173 struct pipe_resource * buffer)
174 {
175 struct pipe_constant_buffer cb;
176 cb.buffer_size = size;
177 cb.buffer_offset = offset;
178 cb.buffer = buffer;
179 cb.user_buffer = NULL;
180
181 rctx->context.set_constant_buffer(&rctx->context, PIPE_SHADER_COMPUTE, cb_index, &cb);
182 }
183
184 static const struct u_resource_vtbl r600_global_buffer_vtbl =
185 {
186 u_default_resource_get_handle, /* get_handle */
187 r600_compute_global_buffer_destroy, /* resource_destroy */
188 r600_compute_global_transfer_map, /* transfer_map */
189 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
190 r600_compute_global_transfer_unmap, /* transfer_unmap */
191 r600_compute_global_transfer_inline_write /* transfer_inline_write */
192 };
193
194
195 void *evergreen_create_compute_state(
196 struct pipe_context *ctx_,
197 const struct pipe_compute_state *cso)
198 {
199 struct r600_context *ctx = (struct r600_context *)ctx_;
200 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
201
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const unsigned char * code;
205 unsigned i;
206
207 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
208
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
211 #endif
212
213 shader->ctx = (struct r600_context*)ctx;
214 /* XXX: We ignore cso->req_local_mem, because we compute this value
215 * ourselves on a per-kernel basis. */
216 shader->private_size = cso->req_private_mem;
217 shader->input_size = cso->req_input_mem;
218
219 #ifdef HAVE_OPENCL
220 shader->num_kernels = radeon_llvm_get_num_kernels(code, header->num_bytes);
221 shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
222
223 for (i = 0; i < shader->num_kernels; i++) {
224 struct r600_kernel *kernel = &shader->kernels[i];
225 kernel->llvm_module = radeon_llvm_get_kernel_module(i, code,
226 header->num_bytes);
227 }
228 #endif
229 return shader;
230 }
231
232 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
233 {
234 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
235
236 free(shader);
237 }
238
239 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
240 {
241 struct r600_context *ctx = (struct r600_context *)ctx_;
242
243 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
244
245 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
246 }
247
248 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
249 * kernel parameters, there are implicit parameters that need to be stored
250 * in the vertex buffer as well. Here is how these parameters are organized in
251 * the buffer:
252 *
253 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
254 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
255 * DWORDS 6-8: Number of work items within each work group in each dimension
256 * (x,y,z)
257 * DWORDS 9+ : Kernel parameters
258 */
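/* For example (hypothetical launch): with block_layout = {64, 1, 1} and
 * grid_layout = {4, 2, 1} the buffer would begin with
 *   DWORDS 0-2: 4, 2, 1     (number of work groups)
 *   DWORDS 3-5: 256, 2, 1   (global work size = grid * block)
 *   DWORDS 6-8: 64, 1, 1    (work-group size)
 * followed by the kernel parameters from DWORD 9 onwards.
 */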
259 void evergreen_compute_upload_input(
260 struct pipe_context *ctx_,
261 const uint *block_layout,
262 const uint *grid_layout,
263 const void *input)
264 {
265 struct r600_context *ctx = (struct r600_context *)ctx_;
266 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
267 int i;
268 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
269 * parameters.
270 */
271 unsigned input_size = shader->input_size + 36;
272 uint32_t * num_work_groups_start;
273 uint32_t * global_size_start;
274 uint32_t * local_size_start;
275 uint32_t * kernel_parameters_start;
276 struct pipe_box box;
277 struct pipe_transfer *transfer = NULL;
278
279 if (shader->input_size == 0) {
280 return;
281 }
282
283 if (!shader->kernel_param) {
284 /* Add space for the grid dimensions */
285 shader->kernel_param = (struct r600_resource *)
286 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
287 PIPE_USAGE_IMMUTABLE, input_size);
288 }
289
290 u_box_1d(0, input_size, &box);
291 num_work_groups_start = ctx_->transfer_map(ctx_,
292 (struct pipe_resource*)shader->kernel_param,
293 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
294 &box, &transfer);
295 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
296 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
297 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
298
299 /* Copy the work group size */
300 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
301
302 /* Copy the global size */
303 for (i = 0; i < 3; i++) {
304 global_size_start[i] = grid_layout[i] * block_layout[i];
305 }
306
307 /* Copy the local dimensions */
308 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
309
310 /* Copy the kernel inputs */
311 memcpy(kernel_parameters_start, input, shader->input_size);
312
313 for (i = 0; i < (input_size / 4); i++) {
314 COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
315 ((unsigned*)num_work_groups_start)[i]);
316 }
317
318 ctx_->transfer_unmap(ctx_, transfer);
319
320 /* ID=0 is reserved for the parameters */
321 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
322 (struct pipe_resource*)shader->kernel_param);
323 }
324
325 static void evergreen_emit_direct_dispatch(
326 struct r600_context *rctx,
327 const uint *block_layout, const uint *grid_layout)
328 {
329 int i;
330 struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
331 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
332 unsigned num_waves;
333 unsigned num_pipes = rctx->screen->info.r600_max_pipes;
334 unsigned wave_divisor = (16 * num_pipes);
335 int group_size = 1;
336 int grid_size = 1;
337 unsigned lds_size = shader->active_kernel->bc.nlds_dw;
338
339 /* Calculate group_size/grid_size */
340 for (i = 0; i < 3; i++) {
341 group_size *= block_layout[i];
342 }
343
344 for (i = 0; i < 3; i++) {
345 grid_size *= grid_layout[i];
346 }
347
348 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
349 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
350 wave_divisor - 1) / wave_divisor;
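/* For example (hypothetical numbers): a 16x16x1 thread block on a GPU with
 * 8 pipes needs 256 / (16 * 8) = 2 wavefronts per thread group. */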
351
352 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
353 "%u wavefronts per thread block, "
354 "allocating %u dwords lds.\n",
355 num_pipes, num_waves, lds_size);
356
357 r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
358
359 r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
360 r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
361 r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
362 r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
363
364 r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
365 group_size);
366
367 r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
368 r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
369 r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
370 r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
371
372 if (rctx->chip_class < CAYMAN) {
373 assert(lds_size <= 8192);
374 } else {
375 /* Cayman appears to have a slightly smaller limit, see the
376 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
377 assert(lds_size <= 8160);
378 }
379
380 r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
381 lds_size | (num_waves << 14));
382
383 /* Dispatch packet */
384 r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
385 r600_write_value(cs, grid_layout[0]);
386 r600_write_value(cs, grid_layout[1]);
387 r600_write_value(cs, grid_layout[2]);
388 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
389 r600_write_value(cs, 1);
390 }
391
392 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
393 const uint *grid_layout)
394 {
395 struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
396 unsigned flush_flags = 0;
397 int i;
398
399 /* make sure that the gfx ring is the only one active */
400 if (ctx->rings.dma.cs) {
401 ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
402 }
403
404 /* Initialize all the compute-related registers.
405 *
406 * See evergreen_init_atom_start_compute_cs() in this file for the list
407 * of registers initialized by the start_compute_cs_cmd atom.
408 */
409 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
410
411 ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
412 r600_flush_emit(ctx);
413
414 /* Emit colorbuffers. */
415 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
416 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
417 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
418 unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
419 (struct r600_resource*)cb->base.texture,
420 RADEON_USAGE_READWRITE);
421
422 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
423 r600_write_value(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
424 r600_write_value(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
425 r600_write_value(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
426 r600_write_value(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
427 r600_write_value(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
428 r600_write_value(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
429 r600_write_value(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
430
431 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
432 r600_write_value(cs, reloc);
433
434 if (!ctx->keep_tiling_flags) {
435 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
436 r600_write_value(cs, reloc);
437 }
438
439 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
440 r600_write_value(cs, reloc);
441 }
442 if (ctx->keep_tiling_flags) {
443 for (; i < 8 ; i++) {
444 r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
445 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
446 }
447 for (; i < 12; i++) {
448 r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
449 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
450 }
451 }
452
453 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
454 r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
455 ctx->compute_cb_target_mask);
456
457
458 /* Emit vertex buffer state */
459 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
460 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
461
462 /* Emit constant buffer state */
463 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
464
465 /* Emit compute shader state */
466 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
467
468 /* Emit dispatch state and dispatch packet */
469 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
470
471 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
472 */
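/* Assumption about intent: the dispatch may have written memory that is
 * also reachable through the constant, vertex and texture read caches, so
 * all three are invalidated before anything reads that data back. */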
473 ctx->flags |= R600_CONTEXT_INV_CONST_CACHE |
474 R600_CONTEXT_INV_VERTEX_CACHE |
475 R600_CONTEXT_INV_TEX_CACHE;
476 r600_flush_emit(ctx);
477
478 #if 0
479 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
480 for (i = 0; i < cs->cdw; i++) {
481 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
482 }
483 #endif
484
485 flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
486 if (ctx->keep_tiling_flags) {
487 flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
488 }
489
490 ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags, ctx->screen->cs_count++);
491
492 ctx->flags = 0;
493
494 COMPUTE_DBG(ctx->screen, "shader started\n");
495 }
496
497
498 /**
499 * Emit function for r600_cs_shader_state atom
500 */
501 void evergreen_emit_cs_shader(
502 struct r600_context *rctx,
503 struct r600_atom *atom)
504 {
505 struct r600_cs_shader_state *state =
506 (struct r600_cs_shader_state*)atom;
507 struct r600_pipe_compute *shader = state->shader;
508 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
509 struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
510 uint64_t va;
511
512 va = r600_resource_va(&rctx->screen->screen, &kernel->code_bo->b.b);
513
514 r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
515 r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
516 r600_write_value(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
517 S_0288D4_NUM_GPRS(kernel->bc.ngpr)
518 | S_0288D4_STACK_SIZE(kernel->bc.nstack));
519 r600_write_value(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
520
521 r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
522 r600_write_value(cs, r600_context_bo_reloc(rctx, &rctx->rings.gfx,
523 kernel->code_bo, RADEON_USAGE_READ));
524 }
525
526 static void evergreen_launch_grid(
527 struct pipe_context *ctx_,
528 const uint *block_layout, const uint *grid_layout,
529 uint32_t pc, const void *input)
530 {
531 struct r600_context *ctx = (struct r600_context *)ctx_;
532
533 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
534 struct r600_kernel *kernel = &shader->kernels[pc];
535
536 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
537
538 #ifdef HAVE_OPENCL
539
540 if (!kernel->code_bo) {
541 void *p;
542 struct r600_bytecode *bc = &kernel->bc;
543 LLVMModuleRef mod = kernel->llvm_module;
544 boolean use_kill = false;
545 bool dump = (ctx->screen->debug_flags & DBG_CS) != 0;
546 unsigned use_sb = ctx->screen->debug_flags & DBG_SB_CS;
547 unsigned sb_disasm = use_sb ||
548 (ctx->screen->debug_flags & DBG_SB_DISASM);
549
550 r600_bytecode_init(bc, ctx->chip_class, ctx->family,
551 ctx->screen->has_compressed_msaa_texturing);
552 bc->type = TGSI_PROCESSOR_COMPUTE;
553 bc->isa = ctx->isa;
554 r600_llvm_compile(mod, ctx->family, bc, &use_kill, dump);
555
556 if (dump && !sb_disasm) {
557 r600_bytecode_disasm(bc);
558 } else if ((dump && sb_disasm) || use_sb) {
559 if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
560 R600_ERR("r600_sb_bytecode_process failed!\n");
561 }
562
563 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
564 kernel->bc.ndw * 4);
565 p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
566 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
567 ctx->ws->buffer_unmap(kernel->code_bo->cs_buf);
568 }
569 #endif
570 shader->active_kernel = kernel;
571 ctx->cs_shader_state.kernel_index = pc;
572 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
573 compute_emit_cs(ctx, block_layout, grid_layout);
574 }
575
576 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
577 unsigned start, unsigned count,
578 struct pipe_surface ** surfaces)
579 {
580 struct r600_context *ctx = (struct r600_context *)ctx_;
581 struct r600_surface **resources = (struct r600_surface **)surfaces;
582
583 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
584 start, count);
585
586 for (int i = 0; i < count; i++) {
587 /* The first two vertex buffers are reserved for parameters and
588 * global buffers. */
589 unsigned vtx_id = 2 + i;
590 if (resources[i]) {
591 struct r600_resource_global *buffer =
592 (struct r600_resource_global*)
593 resources[i]->base.texture;
594 if (resources[i]->base.writable) {
595 assert(i+1 < 12);
596
597 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
598 (struct r600_resource *)resources[i]->base.texture,
599 buffer->chunk->start_in_dw*4,
600 resources[i]->base.texture->width0);
601 }
602
603 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
604 buffer->chunk->start_in_dw * 4,
605 resources[i]->base.texture);
606 }
607 }
608 }
609
610 static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
611 unsigned start_slot, unsigned count,
612 struct pipe_sampler_view **views)
613 {
614 struct r600_pipe_sampler_view **resource =
615 (struct r600_pipe_sampler_view **)views;
616
617 for (int i = 0; i < count; i++) {
618 if (resource[i]) {
619 assert(i+1 < 12);
620 /* XXX: Implement */
621 assert(!"Compute samplers not implemented.");
622 /* FETCH0 = VTX0 (param buffer),
623 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
624 }
625 }
626 }
627
628 static void evergreen_bind_compute_sampler_states(
629 struct pipe_context *ctx_,
630 unsigned start_slot,
631 unsigned num_samplers,
632 void **samplers_)
633 {
634 struct compute_sampler_state ** samplers =
635 (struct compute_sampler_state **)samplers_;
636
637 for (int i = 0; i < num_samplers; i++) {
638 if (samplers[i]) {
639 /* XXX: Implement */
640 assert(!"Compute samplers not implemented.");
641 }
642 }
643 }
644
645 static void evergreen_set_global_binding(
646 struct pipe_context *ctx_, unsigned first, unsigned n,
647 struct pipe_resource **resources,
648 uint32_t **handles)
649 {
650 struct r600_context *ctx = (struct r600_context *)ctx_;
651 struct compute_memory_pool *pool = ctx->screen->global_pool;
652 struct r600_resource_global **buffers =
653 (struct r600_resource_global **)resources;
654
655 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
656 first, n);
657
658 if (!resources) {
659 /* XXX: Unset */
660 return;
661 }
662
663 compute_memory_finalize_pending(pool, ctx_);
664
665 for (int i = 0; i < n; i++)
666 {
667 assert(resources[i]->target == PIPE_BUFFER);
668 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
669
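/* The handle handed back to the state tracker is the byte offset of
 * this buffer within the global memory pool (chunk offset in dwords
 * times 4). */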
670 *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
671 }
672
673 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
674 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
675 (struct pipe_resource*)pool->bo);
676 }
677
678 /**
679 * This function initializes all the compute specific registers that need to
680 * be initialized for each compute command stream. Registers that are common
681 * to both compute and 3D will be initialized at the beginning of each compute
682 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
683 * packet requires that the shader type bit be set, we must initialize all
684 * context registers needed for compute in this function. The registers
685 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
686 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
687 * on the GPU family.
688 */
689 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
690 {
691 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
692 int num_threads;
693 int num_stack_entries;
694
695 /* since all required registers are initialised in the
696 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
697 */
698 r600_init_command_buffer(cb, 256);
699 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
700
701 /* This must be first. */
702 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
703 r600_store_value(cb, 0x80000000);
704 r600_store_value(cb, 0x80000000);
705
706 /* We're setting config registers here. */
707 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
708 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
709
710 switch (ctx->family) {
711 case CHIP_CEDAR:
712 default:
713 num_threads = 128;
714 num_stack_entries = 256;
715 break;
716 case CHIP_REDWOOD:
717 num_threads = 128;
718 num_stack_entries = 256;
719 break;
720 case CHIP_JUNIPER:
721 num_threads = 128;
722 num_stack_entries = 512;
723 break;
724 case CHIP_CYPRESS:
725 case CHIP_HEMLOCK:
726 num_threads = 128;
727 num_stack_entries = 512;
728 break;
729 case CHIP_PALM:
730 num_threads = 128;
731 num_stack_entries = 256;
732 break;
733 case CHIP_SUMO:
734 num_threads = 128;
735 num_stack_entries = 256;
736 break;
737 case CHIP_SUMO2:
738 num_threads = 128;
739 num_stack_entries = 512;
740 break;
741 case CHIP_BARTS:
742 num_threads = 128;
743 num_stack_entries = 512;
744 break;
745 case CHIP_TURKS:
746 num_threads = 128;
747 num_stack_entries = 256;
748 break;
749 case CHIP_CAICOS:
750 num_threads = 128;
751 num_stack_entries = 256;
752 break;
753 }
754
755 /* Config Registers */
756 if (ctx->chip_class < CAYMAN)
757 evergreen_init_common_regs(cb, ctx->chip_class, ctx->family,
758 ctx->screen->info.drm_minor);
759 else
760 cayman_init_common_regs(cb, ctx->chip_class, ctx->family,
761 ctx->screen->info.drm_minor);
762
763 /* The primitive type always needs to be POINTLIST for compute. */
764 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
765 V_008958_DI_PT_POINTLIST);
766
767 if (ctx->chip_class < CAYMAN) {
768
769 /* These registers control which simds can be used by each stage.
770 * The default for these registers is 0xffffffff, which means
771 * all simds are available for each stage. It's possible we may
772 * want to play around with these in the future, but for now
773 * the default value is fine.
774 *
775 * R_008E20_SQ_STATIC_THREAD_MGMT1
776 * R_008E24_SQ_STATIC_THREAD_MGMT2
777 * R_008E28_SQ_STATIC_THREAD_MGMT3
778 */
779
780 /* XXX: We may need to adjust the thread and stack resource
781 * values for 3D/compute interop */
782
783 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
784
785 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
786 * Set the number of threads used by the PS/VS/GS/ES stage to
787 * 0.
788 */
789 r600_store_value(cb, 0);
790
791 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
792 * Set the number of threads used by the CS (aka LS) stage to
793 * the maximum number of threads and set the number of threads
794 * for the HS stage to 0. */
795 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
796
797 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
798 * Set the Control Flow stack entries to 0 for PS/VS stages */
799 r600_store_value(cb, 0);
800
801 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
802 * Set the Control Flow stack entries to 0 for GS/ES stages */
803 r600_store_value(cb, 0);
804
805 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
806 * Set the Control Flow stack entries to 0 for the HS stage, and
807 * set it to the maximum value for the CS (aka LS) stage. */
808 r600_store_value(cb,
809 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
810 }
811 /* Give the compute shader all the available LDS space.
812 * NOTE: This only sets the maximum number of dwords that a compute
813 * shader can allocate. When a shader is executed, we still need to
814 * allocate the appropriate amount of LDS dwords using the
815 * CM_R_0288E8_SQ_LDS_ALLOC register.
816 */
817 if (ctx->chip_class < CAYMAN) {
818 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
819 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
820 } else {
821 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
822 S_0286FC_NUM_PS_LDS(0) |
823 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
824 }
825
826 /* Context Registers */
827
828 if (ctx->chip_class < CAYMAN) {
829 /* workaround for hw issues with dyn gpr - must set all limits
830 * to 240 instead of 0, 0x1e == 240 / 8
831 */
832 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
833 S_028838_PS_GPRS(0x1e) |
834 S_028838_VS_GPRS(0x1e) |
835 S_028838_GS_GPRS(0x1e) |
836 S_028838_ES_GPRS(0x1e) |
837 S_028838_HS_GPRS(0x1e) |
838 S_028838_LS_GPRS(0x1e));
839 }
840
841 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
842 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
843 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
844
845 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
846
847 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
848 S_0286E8_TID_IN_GROUP_ENA
849 | S_0286E8_TGID_ENA
850 | S_0286E8_DISABLE_INDEX_PACK)
851 ;
852
853 /* The LOOP_CONST registers are an optimization for loops that allows
854 * you to store the initial counter, increment value, and maximum
855 * counter value in a register so that hardware can calculate the
856 * correct number of iterations for the loop, so that you don't need
857 * to have the loop counter in your shader code. We don't currently use
858 * this optimization, so we must keep track of the counter in the
859 * shader and use a break instruction to exit loops. However, the
860 * hardware will still use this register to determine when to exit a
861 * loop, so we need to initialize the counter to 0, set the increment
862 * value to 1 and the maximum counter value to 4095 (0xfff), which
863 * is the maximum value allowed. This gives us a maximum of 4096
864 * iterations for our loops, but hopefully our break instruction will
865 * execute some time before the 4096th iteration.
866 */
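/* Assumption about the packed LOOP_CONST format: 0x1000FFF appears to
 * encode a maximum count of 0xfff, an initial value of 0 and an
 * increment of 1, matching the description above. */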
867 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
868 }
869
870 void evergreen_init_compute_state_functions(struct r600_context *ctx)
871 {
872 ctx->context.create_compute_state = evergreen_create_compute_state;
873 ctx->context.delete_compute_state = evergreen_delete_compute_state;
874 ctx->context.bind_compute_state = evergreen_bind_compute_state;
875 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
876 ctx->context.set_compute_resources = evergreen_set_compute_resources;
877 ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
878 ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
879 ctx->context.set_global_binding = evergreen_set_global_binding;
880 ctx->context.launch_grid = evergreen_launch_grid;
881
882 /* We always use at least one vertex buffer for parameters (id = 1)*/
883 ctx->cs_vertex_buffer_state.enabled_mask =
884 ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
885 }
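/* Rough usage sketch (illustrative only, not part of the driver): a gallium
 * state tracker drives the hooks installed above roughly like this, where
 * "pipe" is a struct pipe_context backed by this driver and the variable
 * names are hypothetical:
 *
 *   void *cs = pipe->create_compute_state(pipe, &cs_templ);
 *   pipe->bind_compute_state(pipe, cs);
 *   pipe->set_global_binding(pipe, 0, num_buffers, resources, handles);
 *
 *   uint block[3] = {64, 1, 1};   hypothetical work-group size
 *   uint grid[3]  = {16, 1, 1};   hypothetical number of work groups
 *   pipe->launch_grid(pipe, block, grid, 0, kernel_input);
 *
 *   pipe->delete_compute_state(pipe, cs);
 */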
886
887
888 struct pipe_resource *r600_compute_global_buffer_create(
889 struct pipe_screen *screen,
890 const struct pipe_resource *templ)
891 {
892 struct r600_resource_global* result = NULL;
893 struct r600_screen* rscreen = NULL;
894 int size_in_dw = 0;
895
896 assert(templ->target == PIPE_BUFFER);
897 assert(templ->bind & PIPE_BIND_GLOBAL);
898 assert(templ->array_size == 1 || templ->array_size == 0);
899 assert(templ->depth0 == 1 || templ->depth0 == 0);
900 assert(templ->height0 == 1 || templ->height0 == 0);
901
902 result = (struct r600_resource_global*)
903 CALLOC(sizeof(struct r600_resource_global), 1);
904 rscreen = (struct r600_screen*)screen;
905
906 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
907 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
908 templ->array_size);
909
910 result->base.b.vtbl = &r600_global_buffer_vtbl;
911 result->base.b.b.screen = screen;
912 result->base.b.b = *templ;
913 pipe_reference_init(&result->base.b.b.reference, 1);
914
915 size_in_dw = (templ->width0+3) / 4;
916
917 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
918
919 if (result->chunk == NULL)
920 {
921 free(result);
922 return NULL;
923 }
924
925 return &result->base.b.b;
926 }
927
928 void r600_compute_global_buffer_destroy(
929 struct pipe_screen *screen,
930 struct pipe_resource *res)
931 {
932 struct r600_resource_global* buffer = NULL;
933 struct r600_screen* rscreen = NULL;
934
935 assert(res->target == PIPE_BUFFER);
936 assert(res->bind & PIPE_BIND_GLOBAL);
937
938 buffer = (struct r600_resource_global*)res;
939 rscreen = (struct r600_screen*)screen;
940
941 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
942
943 buffer->chunk = NULL;
944 free(res);
945 }
946
947 void *r600_compute_global_transfer_map(
948 struct pipe_context *ctx_,
949 struct pipe_resource *resource,
950 unsigned level,
951 unsigned usage,
952 const struct pipe_box *box,
953 struct pipe_transfer **ptransfer)
954 {
955 struct r600_context *rctx = (struct r600_context*)ctx_;
956 struct compute_memory_pool *pool = rctx->screen->global_pool;
957 struct r600_resource_global* buffer =
958 (struct r600_resource_global*)resource;
959
960 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
961 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
962 "width = %u, height = %u, depth = %u)\n", level, usage,
963 box->x, box->y, box->z, box->width, box->height,
964 box->depth);
965 COMPUTE_DBG(rctx->screen, "Buffer: %u (buffer offset in global memory) "
966 "+ %u (box.x)\n", buffer->chunk->start_in_dw, box->x);
967
968
969 compute_memory_finalize_pending(pool, ctx_);
970
971 assert(resource->target == PIPE_BUFFER);
972 assert(resource->bind & PIPE_BIND_GLOBAL);
973 assert(box->x >= 0);
974 assert(box->y == 0);
975 assert(box->z == 0);
976
977 ///TODO: do it better, mapping is not possible if the pool is too big
978 return pipe_buffer_map_range(ctx_, (struct pipe_resource*)buffer->chunk->pool->bo,
979 box->x + (buffer->chunk->start_in_dw * 4),
980 box->width, usage, ptransfer);
981 }
982
983 void r600_compute_global_transfer_unmap(
984 struct pipe_context *ctx_,
985 struct pipe_transfer* transfer)
986 {
987 /* struct r600_resource_global are not real resources, they just map
988 * to an offset within the compute memory pool. The function
989 * r600_compute_global_transfer_map() maps the memory pool
990 * resource rather than the struct r600_resource_global passed to
991 * it as an argument and then initializes ptransfer->resource with
992 * the memory pool resource (via pipe_buffer_map_range).
993 * When transfer_unmap is called, it uses the memory pool's
994 * vtable, which calls r600_buffer_transfer_unmap() rather than
995 * this function.
996 */
997 assert (!"This function should not be called");
998 }
999
1000 void r600_compute_global_transfer_flush_region(
1001 struct pipe_context *ctx_,
1002 struct pipe_transfer *transfer,
1003 const struct pipe_box *box)
1004 {
1005 assert(0 && "TODO");
1006 }
1007
1008 void r600_compute_global_transfer_inline_write(
1009 struct pipe_context *pipe,
1010 struct pipe_resource *resource,
1011 unsigned level,
1012 unsigned usage,
1013 const struct pipe_box *box,
1014 const void *data,
1015 unsigned stride,
1016 unsigned layer_stride)
1017 {
1018 assert(0 && "TODO");
1019 }