r600g: move streamout state to drivers/radeon
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/u_double_list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_resource.h"
43 #include "r600_shader.h"
44 #include "r600_pipe.h"
45 #include "r600_formats.h"
46 #include "evergreen_compute.h"
47 #include "evergreen_compute_internal.h"
48 #include "compute_memory_pool.h"
49 #include "sb/sb_public.h"
50 #ifdef HAVE_OPENCL
51 #include "radeon_llvm_util.h"
52 #endif
53
54 /**
55 RAT0 is for global binding write
56 VTX1 is for global binding read
57
58 for writing images RAT1...
59 for reading images TEX2...
60 TEX2-RAT1 is paired
61
62 TEX2... consumes the same fetch resources that VTX2... would consume
63
64 CONST0 and VTX0 are for parameters
65 CONST0 binds the smaller input parameter buffer and is used for constant indexing;
66 it is also constant cached
67 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
68 the constant cache can handle
69
70 RATs are limited to 12, so we can only bind at most 11 textures for writing
71 because we reserve RAT0 for global bindings. With byte addressing enabled,
72 we should reserve another one too => 10 image bindings for writing max.
73
74 from Nvidia OpenCL:
75 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
76 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
77
78 so 10 for writing is enough. 176 is the max for reading according to the docs
79
80 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
81 writable images will consume TEX slots, and VTX slots too because of linear indexing
82
83 */
84
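/* A minimal illustrative sketch of the slot arithmetic described above; the
 * helper names are hypothetical, and the driver open-codes the same math in
 * evergreen_set_compute_resources() and evergreen_set_global_binding():
 *
 *   static inline unsigned writable_image_rat_id(unsigned i)
 *   {
 *           return i + 1;   // RAT0 is reserved for the global pool
 *   }
 *
 *   static inline unsigned compute_resource_vtx_id(unsigned i)
 *   {
 *           return i + 2;   // VTX0 = kernel params, VTX1 = global pool
 *   }
 */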
85 struct r600_resource* r600_compute_buffer_alloc_vram(
86 struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource * buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create(
93 (struct pipe_screen*) screen,
94 PIPE_BIND_CUSTOM,
95 PIPE_USAGE_IMMUTABLE,
96 size);
97
98 return (struct r600_resource *)buffer;
99 }
100
101
102 static void evergreen_set_rat(
103 struct r600_pipe_compute *pipe,
104 int id,
105 struct r600_resource* bo,
106 int start,
107 int size)
108 {
109 struct pipe_surface rat_templ;
110 struct r600_surface *surf = NULL;
111 struct r600_context *rctx = NULL;
112
113 assert(id < 12);
114 assert((size & 3) == 0);
115 assert((start & 0xFF) == 0);
116
117 rctx = pipe->ctx;
118
119 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
120
121 /* Create the RAT surface */
122 memset(&rat_templ, 0, sizeof(rat_templ));
123 rat_templ.format = PIPE_FORMAT_R32_UINT;
124 rat_templ.u.tex.level = 0;
125 rat_templ.u.tex.first_layer = 0;
126 rat_templ.u.tex.last_layer = 0;
127
128 /* Add the RAT to the list of color buffers */
129 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
130 (struct pipe_context *)pipe->ctx,
131 (struct pipe_resource *)bo, &rat_templ);
132
133 /* Update the number of color buffers */
134 pipe->ctx->framebuffer.state.nr_cbufs =
135 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
136
137 /* Update the cb_target_mask
138 * XXX: I think this is a potential spot for bugs once we start doing
139 * GL interop. cb_target_mask may be modified in the 3D sections
140 * of this driver. */
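/* Each colorbuffer owns four write-enable bits in CB_TARGET_MASK (one per
 * channel), so the statement below enables all four channels of RAT "id". */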
141 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
142
143 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
144 evergreen_init_color_surface_rat(rctx, surf);
145 }
146
147 static void evergreen_cs_set_vertex_buffer(
148 struct r600_context * rctx,
149 unsigned vb_index,
150 unsigned offset,
151 struct pipe_resource * buffer)
152 {
153 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
154 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
155 vb->stride = 1;
156 vb->buffer_offset = offset;
157 vb->buffer = buffer;
158 vb->user_buffer = NULL;
159
160 /* The vertex instructions in the compute shaders use the texture cache,
161 * so we need to invalidate it. */
162 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
163 state->enabled_mask |= 1 << vb_index;
164 state->dirty_mask |= 1 << vb_index;
165 state->atom.dirty = true;
166 }
167
168 static void evergreen_cs_set_constant_buffer(
169 struct r600_context * rctx,
170 unsigned cb_index,
171 unsigned offset,
172 unsigned size,
173 struct pipe_resource * buffer)
174 {
175 struct pipe_constant_buffer cb;
176 cb.buffer_size = size;
177 cb.buffer_offset = offset;
178 cb.buffer = buffer;
179 cb.user_buffer = NULL;
180
181 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
182 }
183
184 static const struct u_resource_vtbl r600_global_buffer_vtbl =
185 {
186 u_default_resource_get_handle, /* get_handle */
187 r600_compute_global_buffer_destroy, /* resource_destroy */
188 r600_compute_global_transfer_map, /* transfer_map */
189 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
190 r600_compute_global_transfer_unmap, /* transfer_unmap */
191 r600_compute_global_transfer_inline_write /* transfer_inline_write */
192 };
193
194
195 void *evergreen_create_compute_state(
196 struct pipe_context *ctx_,
197 const struct pipe_compute_state *cso)
198 {
199 struct r600_context *ctx = (struct r600_context *)ctx_;
200 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
201
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const unsigned char * code;
205 unsigned i;
206
207 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
208
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
211 #endif
212
213 shader->ctx = (struct r600_context*)ctx;
214 shader->local_size = cso->req_local_mem;
215 shader->private_size = cso->req_private_mem;
216 shader->input_size = cso->req_input_mem;
217
218 #ifdef HAVE_OPENCL
219 shader->num_kernels = radeon_llvm_get_num_kernels(code, header->num_bytes);
220 shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
221
222 for (i = 0; i < shader->num_kernels; i++) {
223 struct r600_kernel *kernel = &shader->kernels[i];
224 kernel->llvm_module = radeon_llvm_get_kernel_module(i, code,
225 header->num_bytes);
226 }
227 #endif
228 return shader;
229 }
230
231 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
232 {
233 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
234
235 free(shader);
236 }
237
238 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
239 {
240 struct r600_context *ctx = (struct r600_context *)ctx_;
241
242 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
243
244 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
245 }
246
247 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
248 * kernel parameters, there are implicit parameters that need to be stored
249 * in the vertex buffer as well. Here is how these parameters are organized in
250 * the buffer:
251 *
252 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
253 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
254 * DWORDS 6-8: Number of work items within each work group in each dimension
255 * (x,y,z)
256 * DWORDS 9+ : Kernel parameters
257 */
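/* A minimal sketch of that layout; the struct is hypothetical and only
 * illustrates the 36-byte implicit header this function writes before the
 * user-supplied arguments:
 *
 *   struct implicit_kernel_args {
 *           uint32_t num_work_groups[3]; // DWORDS 0-2: grid_layout
 *           uint32_t global_size[3];     // DWORDS 3-5: grid_layout * block_layout
 *           uint32_t local_size[3];      // DWORDS 6-8: block_layout
 *           // DWORDS 9+: kernel parameters follow immediately
 *   };
 */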
258 void evergreen_compute_upload_input(
259 struct pipe_context *ctx_,
260 const uint *block_layout,
261 const uint *grid_layout,
262 const void *input)
263 {
264 struct r600_context *ctx = (struct r600_context *)ctx_;
265 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
266 int i;
267 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
268 * parameters.
269 */
270 unsigned input_size = shader->input_size + 36;
271 uint32_t * num_work_groups_start;
272 uint32_t * global_size_start;
273 uint32_t * local_size_start;
274 uint32_t * kernel_parameters_start;
275 struct pipe_box box;
276 struct pipe_transfer *transfer = NULL;
277
278 if (shader->input_size == 0) {
279 return;
280 }
281
282 if (!shader->kernel_param) {
283 /* Add space for the grid dimensions */
284 shader->kernel_param = (struct r600_resource *)
285 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
286 PIPE_USAGE_IMMUTABLE, input_size);
287 }
288
289 u_box_1d(0, input_size, &box);
290 num_work_groups_start = ctx_->transfer_map(ctx_,
291 (struct pipe_resource*)shader->kernel_param,
292 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
293 &box, &transfer);
294 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
295 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
296 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
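/* The pointers above are uint32_t*, so each "3 * (sizeof(uint) / 4)" step
 * advances by three dwords: work group count at dword 0, global size at
 * dword 3, local size at dword 6 and kernel parameters from dword 9 on. */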
297
298 /* Copy the number of work groups (the grid dimensions) */
299 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
300
301 /* Copy the global size */
302 for (i = 0; i < 3; i++) {
303 global_size_start[i] = grid_layout[i] * block_layout[i];
304 }
305
306 /* Copy the local dimensions */
307 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
308
309 /* Copy the kernel inputs */
310 memcpy(kernel_parameters_start, input, shader->input_size);
311
312 for (i = 0; i < (input_size / 4); i++) {
313 COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
314 ((unsigned*)num_work_groups_start)[i]);
315 }
316
317 ctx_->transfer_unmap(ctx_, transfer);
318
319 /* ID=0 is reserved for the parameters */
320 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
321 (struct pipe_resource*)shader->kernel_param);
322 }
323
324 static void evergreen_emit_direct_dispatch(
325 struct r600_context *rctx,
326 const uint *block_layout, const uint *grid_layout)
327 {
328 int i;
329 struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
330 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
331 unsigned num_waves;
332 unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
333 unsigned wave_divisor = (16 * num_pipes);
334 int group_size = 1;
335 int grid_size = 1;
336 unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;
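/* lds_size is in dwords: the kernel's requested local memory (req_local_mem,
 * in bytes, hence the / 4) plus the LDS dwords the compiler itself allocated
 * (bc.nlds_dw). */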
337
338 /* Calculate group_size/grid_size */
339 for (i = 0; i < 3; i++) {
340 group_size *= block_layout[i];
341 }
342
343 for (i = 0; i < 3; i++) {
344 grid_size *= grid_layout[i];
345 }
346
347 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
348 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
349 wave_divisor - 1) / wave_divisor;
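/* Worked example (illustrative): a 16x16x1 block is 256 work items; with
 * r600_max_pipes = 8, wave_divisor is 128, so num_waves = ceil(256/128) = 2. */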
350
351 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
352 "%u wavefronts per thread block, "
353 "allocating %u dwords lds.\n",
354 num_pipes, num_waves, lds_size);
355
356 r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
357
358 r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
359 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
360 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
361 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
362
363 r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
364 group_size);
365
366 r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
367 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
368 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
369 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
370
371 if (rctx->b.chip_class < CAYMAN) {
372 assert(lds_size <= 8192);
373 } else {
374 /* Cayman appears to have a slightly smaller limit, see the
375 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
376 assert(lds_size <= 8160);
377 }
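/* These asserts mirror what evergreen_init_atom_start_compute_cs() programs
 * below: NUM_LS_LDS(8192) dwords on evergreen, and NUM_LS_LDS(255) * 32 =
 * 8160 dwords on cayman. */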
378
379 r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
380 lds_size | (num_waves << 14));
381
382 /* Dispatch packet */
383 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
384 radeon_emit(cs, grid_layout[0]);
385 radeon_emit(cs, grid_layout[1]);
386 radeon_emit(cs, grid_layout[2]);
387 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
388 radeon_emit(cs, 1);
389 }
390
391 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
392 const uint *grid_layout)
393 {
394 struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
395 unsigned flush_flags = 0;
396 int i;
397
398 /* make sure that the gfx ring is the only one active */
399 if (ctx->b.rings.dma.cs) {
400 ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
401 }
402
403 /* Initialize all the compute-related registers.
404 *
405 * See evergreen_init_atom_start_compute_cs() in this file for the list
406 * of registers initialized by the start_compute_cs_cmd atom.
407 */
408 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
409
410 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
411 r600_flush_emit(ctx);
412
413 /* Emit colorbuffers. */
414 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
415 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
416 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
417 unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
418 (struct r600_resource*)cb->base.texture,
419 RADEON_USAGE_READWRITE);
420
421 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
422 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
423 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
424 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
425 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
426 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
427 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
428 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
429
430 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
431 radeon_emit(cs, reloc);
432
433 if (!ctx->keep_tiling_flags) {
434 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
435 radeon_emit(cs, reloc);
436 }
437
438 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
439 radeon_emit(cs, reloc);
440 }
441 if (ctx->keep_tiling_flags) {
442 for (; i < 8 ; i++) {
443 r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
444 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
445 }
446 for (; i < 12; i++) {
447 r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
448 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
449 }
450 }
451
452 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
453 r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
454 ctx->compute_cb_target_mask);
455
456
457 /* Emit vertex buffer state */
458 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
459 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
460
461 /* Emit constant buffer state */
462 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
463
464 /* Emit compute shader state */
465 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
466
467 /* Emit dispatch state and dispatch packet */
468 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
469
470 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
471 */
472 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
473 R600_CONTEXT_INV_VERTEX_CACHE |
474 R600_CONTEXT_INV_TEX_CACHE;
475 r600_flush_emit(ctx);
476
477 #if 0
478 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
479 for (i = 0; i < cs->cdw; i++) {
480 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
481 }
482 #endif
483
484 flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
485 if (ctx->keep_tiling_flags) {
486 flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
487 }
488
489 ctx->b.ws->cs_flush(ctx->b.rings.gfx.cs, flush_flags, ctx->screen->cs_count++);
490
491 ctx->b.flags = 0;
492
493 COMPUTE_DBG(ctx->screen, "shader started\n");
494 }
495
496
497 /**
498 * Emit function for r600_cs_shader_state atom
499 */
500 void evergreen_emit_cs_shader(
501 struct r600_context *rctx,
502 struct r600_atom *atom)
503 {
504 struct r600_cs_shader_state *state =
505 (struct r600_cs_shader_state*)atom;
506 struct r600_pipe_compute *shader = state->shader;
507 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
508 struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
509 uint64_t va;
510
511 va = r600_resource_va(&rctx->screen->b.b, &kernel->code_bo->b.b);
512
513 r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
514 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
515 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
516 S_0288D4_NUM_GPRS(kernel->bc.ngpr)
517 | S_0288D4_STACK_SIZE(kernel->bc.nstack));
518 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
519
520 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
521 radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
522 kernel->code_bo, RADEON_USAGE_READ));
523 }
524
525 static void evergreen_launch_grid(
526 struct pipe_context *ctx_,
527 const uint *block_layout, const uint *grid_layout,
528 uint32_t pc, const void *input)
529 {
530 struct r600_context *ctx = (struct r600_context *)ctx_;
531
532 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
533 struct r600_kernel *kernel = &shader->kernels[pc];
534
535 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
536
537 #ifdef HAVE_OPENCL
538
539 if (!kernel->code_bo) {
540 void *p;
541 struct r600_bytecode *bc = &kernel->bc;
542 LLVMModuleRef mod = kernel->llvm_module;
543 boolean use_kill = false;
544 bool dump = (ctx->screen->debug_flags & DBG_CS) != 0;
545 unsigned use_sb = ctx->screen->debug_flags & DBG_SB_CS;
546 unsigned sb_disasm = use_sb ||
547 (ctx->screen->debug_flags & DBG_SB_DISASM);
548
549 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
550 ctx->screen->has_compressed_msaa_texturing);
551 bc->type = TGSI_PROCESSOR_COMPUTE;
552 bc->isa = ctx->isa;
553 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
554
555 if (dump && !sb_disasm) {
556 r600_bytecode_disasm(bc);
557 } else if ((dump && sb_disasm) || use_sb) {
558 if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
559 R600_ERR("r600_sb_bytecode_process failed!\n");
560 }
561
562 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
563 kernel->bc.ndw * 4);
564 p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
565 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
566 ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
567 }
568 #endif
569 shader->active_kernel = kernel;
570 ctx->cs_shader_state.kernel_index = pc;
571 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
572 compute_emit_cs(ctx, block_layout, grid_layout);
573 }
574
575 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
576 unsigned start, unsigned count,
577 struct pipe_surface ** surfaces)
578 {
579 struct r600_context *ctx = (struct r600_context *)ctx_;
580 struct r600_surface **resources = (struct r600_surface **)surfaces;
581
582 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
583 start, count);
584
585 for (int i = 0; i < count; i++) {
586 /* The first two vertex buffers are reserved for parameters and
587 * global buffers. */
588 unsigned vtx_id = 2 + i;
589 if (resources[i]) {
590 struct r600_resource_global *buffer =
591 (struct r600_resource_global*)
592 resources[i]->base.texture;
593 if (resources[i]->base.writable) {
594 assert(i+1 < 12);
595
596 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
597 (struct r600_resource *)resources[i]->base.texture,
598 buffer->chunk->start_in_dw*4,
599 resources[i]->base.texture->width0);
600 }
601
602 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
603 buffer->chunk->start_in_dw * 4,
604 resources[i]->base.texture);
605 }
606 }
607 }
608
609 static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
610 unsigned start_slot, unsigned count,
611 struct pipe_sampler_view **views)
612 {
613 struct r600_pipe_sampler_view **resource =
614 (struct r600_pipe_sampler_view **)views;
615
616 for (int i = 0; i < count; i++) {
617 if (resource[i]) {
618 assert(i+1 < 12);
619 /* XXX: Implement */
620 assert(!"Compute samplers not implemented.");
621 ///FETCH0 = VTX0 (param buffer),
622 //FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
623 }
624 }
625 }
626
627 static void evergreen_bind_compute_sampler_states(
628 struct pipe_context *ctx_,
629 unsigned start_slot,
630 unsigned num_samplers,
631 void **samplers_)
632 {
633 struct compute_sampler_state ** samplers =
634 (struct compute_sampler_state **)samplers_;
635
636 for (int i = 0; i < num_samplers; i++) {
637 if (samplers[i]) {
638 /* XXX: Implement */
639 assert(!"Compute samplers not implemented.");
640 }
641 }
642 }
643
644 static void evergreen_set_global_binding(
645 struct pipe_context *ctx_, unsigned first, unsigned n,
646 struct pipe_resource **resources,
647 uint32_t **handles)
648 {
649 struct r600_context *ctx = (struct r600_context *)ctx_;
650 struct compute_memory_pool *pool = ctx->screen->global_pool;
651 struct r600_resource_global **buffers =
652 (struct r600_resource_global **)resources;
653
654 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
655 first, n);
656
657 if (!resources) {
658 /* XXX: Unset */
659 return;
660 }
661
662 compute_memory_finalize_pending(pool, ctx_);
663
664 for (int i = 0; i < n; i++)
665 {
666 assert(resources[i]->target == PIPE_BUFFER);
667 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
668
669 *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
670 }
671
672 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
673 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
674 (struct pipe_resource*)pool->bo);
675 }
676
677 /**
678 * This function initializes all the compute specific registers that need to
679 * be initialized for each compute command stream. Registers that are common
680 * to both compute and 3D will be initialized at the beginning of each compute
681 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
682 * packet requires that the shader type bit be set, we must initialize all
683 * context registers needed for compute in this function. The registers
684 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
685 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
686 * on the GPU family.
687 */
688 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
689 {
690 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
691 int num_threads;
692 int num_stack_entries;
693
694 /* since all required registers are initialised in the
695 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
696 */
697 r600_init_command_buffer(cb, 256);
698 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
699
700 /* This must be first. */
701 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
702 r600_store_value(cb, 0x80000000);
703 r600_store_value(cb, 0x80000000);
704
705 /* We're setting config registers here. */
706 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
707 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
708
709 switch (ctx->b.family) {
710 case CHIP_CEDAR:
711 default:
712 num_threads = 128;
713 num_stack_entries = 256;
714 break;
715 case CHIP_REDWOOD:
716 num_threads = 128;
717 num_stack_entries = 256;
718 break;
719 case CHIP_JUNIPER:
720 num_threads = 128;
721 num_stack_entries = 512;
722 break;
723 case CHIP_CYPRESS:
724 case CHIP_HEMLOCK:
725 num_threads = 128;
726 num_stack_entries = 512;
727 break;
728 case CHIP_PALM:
729 num_threads = 128;
730 num_stack_entries = 256;
731 break;
732 case CHIP_SUMO:
733 num_threads = 128;
734 num_stack_entries = 256;
735 break;
736 case CHIP_SUMO2:
737 num_threads = 128;
738 num_stack_entries = 512;
739 break;
740 case CHIP_BARTS:
741 num_threads = 128;
742 num_stack_entries = 512;
743 break;
744 case CHIP_TURKS:
745 num_threads = 128;
746 num_stack_entries = 256;
747 break;
748 case CHIP_CAICOS:
749 num_threads = 128;
750 num_stack_entries = 256;
751 break;
752 }
753
754 /* Config Registers */
755 if (ctx->b.chip_class < CAYMAN)
756 evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
757 ctx->screen->b.info.drm_minor);
758 else
759 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
760 ctx->screen->b.info.drm_minor);
761
762 /* The primitive type always needs to be POINTLIST for compute. */
763 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
764 V_008958_DI_PT_POINTLIST);
765
766 if (ctx->b.chip_class < CAYMAN) {
767
768 /* These registers control which simds can be used by each stage.
769 * The default for these registers is 0xffffffff, which means
770 * all simds are available for each stage. It's possible we may
771 * want to play around with these in the future, but for now
772 * the default value is fine.
773 *
774 * R_008E20_SQ_STATIC_THREAD_MGMT1
775 * R_008E24_SQ_STATIC_THREAD_MGMT2
776 * R_008E28_SQ_STATIC_THREAD_MGMT3
777 */
778
779 /* XXX: We may need to adjust the thread and stack resource
780 * values for 3D/compute interop */
781
782 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
783
784 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
785 * Set the number of threads used by the PS/VS/GS/ES stage to
786 * 0.
787 */
788 r600_store_value(cb, 0);
789
790 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
791 * Set the number of threads used by the CS (aka LS) stage to
792 * the maximum number of threads and set the number of threads
793 * for the HS stage to 0. */
794 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
795
796 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
797 * Set the Control Flow stack entries to 0 for PS/VS stages */
798 r600_store_value(cb, 0);
799
800 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
801 * Set the Control Flow stack entries to 0 for GS/ES stages */
802 r600_store_value(cb, 0);
803
804 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
805 * Set the Control Flow stack entries to 0 for the HS stage, and
806 * set it to the maximum value for the CS (aka LS) stage. */
807 r600_store_value(cb,
808 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
809 }
810 /* Give the compute shader all the available LDS space.
811 * NOTE: This only sets the maximum number of dwords that a compute
812 * shader can allocate. When a shader is executed, we still need to
813 * allocate the appropriate amount of LDS dwords using the
814 * CM_R_0288E8_SQ_LDS_ALLOC register.
815 */
816 if (ctx->b.chip_class < CAYMAN) {
817 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
818 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
819 } else {
820 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
821 S_0286FC_NUM_PS_LDS(0) |
822 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
823 }
824
825 /* Context Registers */
826
827 if (ctx->b.chip_class < CAYMAN) {
828 /* workaround for hw issues with dyn gpr - must set all limits
829 * to 240 instead of 0, 0x1e == 240 / 8
830 */
831 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
832 S_028838_PS_GPRS(0x1e) |
833 S_028838_VS_GPRS(0x1e) |
834 S_028838_GS_GPRS(0x1e) |
835 S_028838_ES_GPRS(0x1e) |
836 S_028838_HS_GPRS(0x1e) |
837 S_028838_LS_GPRS(0x1e));
838 }
839
840 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
841 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
842 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
843
844 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
845
846 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
847 S_0286E8_TID_IN_GROUP_ENA
848 | S_0286E8_TGID_ENA
849 | S_0286E8_DISABLE_INDEX_PACK)
850 ;
851
852 /* The LOOP_CONST registers are an optimization for loops that allows
853 * you to store the initial counter, increment value, and maximum
854 * counter value in a register so that hardware can calculate the
855 * correct number of iterations for the loop, so that you don't need
856 * to have the loop counter in your shader code. We don't currently use
857 * this optimization, so we must keep track of the counter in the
858 * shader and use a break instruction to exit loops. However, the
859 * hardware will still use this register to determine when to exit a
860 * loop, so we need to initialize the counter to 0, set the increment
861 * value to 1 and the maximum counter value to 4095 (0xfff), which
862 * is the maximum value allowed. This gives us a maximum of 4096
863 * iterations for our loops, but hopefully our break instruction will
864 * execute some time before the 4096th iteration.
865 */
866 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
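/* Field breakdown of 0x1000FFF, assuming the usual SQ_LOOP_CONST layout of
 * increment in bits 31:24, initial value in bits 23:12 and trip count in
 * bits 11:0:
 *
 *   (0x01 << 24) | (0x000 << 12) | 0xFFF == 0x1000FFF
 */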
867 }
868
869 void evergreen_init_compute_state_functions(struct r600_context *ctx)
870 {
871 ctx->b.b.create_compute_state = evergreen_create_compute_state;
872 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
873 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
874 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
875 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
876 ctx->b.b.set_compute_sampler_views = evergreen_set_cs_sampler_view;
877 ctx->b.b.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
878 ctx->b.b.set_global_binding = evergreen_set_global_binding;
879 ctx->b.b.launch_grid = evergreen_launch_grid;
880
881 /* We always use at least one vertex buffer for parameters (id = 1)*/
882 ctx->cs_vertex_buffer_state.enabled_mask =
883 ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
884 }
885
886
887 struct pipe_resource *r600_compute_global_buffer_create(
888 struct pipe_screen *screen,
889 const struct pipe_resource *templ)
890 {
891 struct r600_resource_global* result = NULL;
892 struct r600_screen* rscreen = NULL;
893 int size_in_dw = 0;
894
895 assert(templ->target == PIPE_BUFFER);
896 assert(templ->bind & PIPE_BIND_GLOBAL);
897 assert(templ->array_size == 1 || templ->array_size == 0);
898 assert(templ->depth0 == 1 || templ->depth0 == 0);
899 assert(templ->height0 == 1 || templ->height0 == 0);
900
901 result = (struct r600_resource_global*)
902 CALLOC(sizeof(struct r600_resource_global), 1);
903 rscreen = (struct r600_screen*)screen;
904
905 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
906 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
907 templ->array_size);
908
909 result->base.b.vtbl = &r600_global_buffer_vtbl;
910 result->base.b.b.screen = screen;
911 result->base.b.b = *templ;
912 pipe_reference_init(&result->base.b.b.reference, 1);
913
914 size_in_dw = (templ->width0+3) / 4;
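/* Illustrative example: a 10-byte buffer becomes (10 + 3) / 4 = 3 dwords,
 * i.e. pool allocations are always rounded up to whole dwords. */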
915
916 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
917
918 if (result->chunk == NULL)
919 {
920 free(result);
921 return NULL;
922 }
923
924 return &result->base.b.b;
925 }
926
927 void r600_compute_global_buffer_destroy(
928 struct pipe_screen *screen,
929 struct pipe_resource *res)
930 {
931 struct r600_resource_global* buffer = NULL;
932 struct r600_screen* rscreen = NULL;
933
934 assert(res->target == PIPE_BUFFER);
935 assert(res->bind & PIPE_BIND_GLOBAL);
936
937 buffer = (struct r600_resource_global*)res;
938 rscreen = (struct r600_screen*)screen;
939
940 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
941
942 buffer->chunk = NULL;
943 free(res);
944 }
945
946 void *r600_compute_global_transfer_map(
947 struct pipe_context *ctx_,
948 struct pipe_resource *resource,
949 unsigned level,
950 unsigned usage,
951 const struct pipe_box *box,
952 struct pipe_transfer **ptransfer)
953 {
954 struct r600_context *rctx = (struct r600_context*)ctx_;
955 struct compute_memory_pool *pool = rctx->screen->global_pool;
956 struct r600_resource_global* buffer =
957 (struct r600_resource_global*)resource;
958
959 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
960 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
961 "width = %u, height = %u, depth = %u)\n", level, usage,
962 box->x, box->y, box->z, box->width, box->height,
963 box->depth);
964 COMPUTE_DBG(rctx->screen, "Buffer: %u (buffer offset in global memory) "
965 "+ %u (box.x)\n", buffer->chunk->start_in_dw, box->x);
966
967
968 compute_memory_finalize_pending(pool, ctx_);
969
970 assert(resource->target == PIPE_BUFFER);
971 assert(resource->bind & PIPE_BIND_GLOBAL);
972 assert(box->x >= 0);
973 assert(box->y == 0);
974 assert(box->z == 0);
975
976 ///TODO: do it better, mapping is not possible if the pool is too big
977 return pipe_buffer_map_range(ctx_, (struct pipe_resource*)buffer->chunk->pool->bo,
978 box->x + (buffer->chunk->start_in_dw * 4),
979 box->width, usage, ptransfer);
980 }
981
982 void r600_compute_global_transfer_unmap(
983 struct pipe_context *ctx_,
984 struct pipe_transfer* transfer)
985 {
986 /* struct r600_resource_global are not real resources, they just map
987 * to an offset within the compute memory pool. The function
988 * r600_compute_global_transfer_map() maps the memory pool
989 * resource rather than the struct r600_resource_global passed to
990 * it as an argument and then initializes ptransfer->resource with
991 * the memory pool resource (via pipe_buffer_map_range).
992 * When transfer_unmap is called it uses the memory pool's
993 * vtable which calls r600_buffer_transfer_unmap() rather than
994 * this function.
995 */
996 assert (!"This function should not be called");
997 }
998
999 void r600_compute_global_transfer_flush_region(
1000 struct pipe_context *ctx_,
1001 struct pipe_transfer *transfer,
1002 const struct pipe_box *box)
1003 {
1004 assert(0 && "TODO");
1005 }
1006
1007 void r600_compute_global_transfer_inline_write(
1008 struct pipe_context *pipe,
1009 struct pipe_resource *resource,
1010 unsigned level,
1011 unsigned usage,
1012 const struct pipe_box *box,
1013 const void *data,
1014 unsigned stride,
1015 unsigned layer_stride)
1016 {
1017 assert(0 && "TODO");
1018 }