src/gallium/drivers/r600/evergreen_compute.c

   1 /*
   2  * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Adam Rak <adam.rak@streamnovation.com>
  25  */
  26
  27 #include <stdio.h>
  28 #include <errno.h>
  29 #include "pipe/p_defines.h"
  30 #include "pipe/p_state.h"
  31 #include "pipe/p_context.h"
  32 #include "util/u_blitter.h"
  33 #include "util/u_double_list.h"
  34 #include "util/u_transfer.h"
  35 #include "util/u_surface.h"
  36 #include "util/u_pack_color.h"
  37 #include "util/u_memory.h"
  38 #include "util/u_inlines.h"
  39 #include "util/u_framebuffer.h"
  40 #include "pipebuffer/pb_buffer.h"
  41 #include "evergreend.h"
  42 #include "r600_resource.h"
  43 #include "r600_shader.h"
  44 #include "r600_pipe.h"
  45 #include "r600_formats.h"
  46 #include "evergreen_compute.h"
  47 #include "evergreen_compute_internal.h"
  48 #include "compute_memory_pool.h"
  49 #include "sb/sb_public.h"
  50 #ifdef HAVE_OPENCL
  51 #include "radeon_llvm_util.h"
  52 #endif
  53
  54 /**
  55 RAT0 is for global binding write
  56 VTX1 is for global binding read
  57
for writing images RAT1...
  59 for reading images TEX2...
  60   TEX2-RAT1 is paired
  61
  62 TEX2... consumes the same fetch resources, that VTX2... would consume
  63
  64 CONST0 and VTX0 is for parameters
  65   CONST0 is binding smaller input parameter buffer, and for constant indexing,
  66   also constant cached
  67   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  68   the constant cache can handle
  69
  70 RAT-s are limited to 12, so we can only bind at most 11 texture for writing
  71 because we reserve RAT0 for global bindings. With byteaddressing enabled,
  72 we should reserve another one too.=> 10 image binding for writing max.
  73
  74 from Nvidia OpenCL:
  75   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  76   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
  77
  78 so 10 for writing is enough. 176 is the max for reading according to the docs
  79
  80 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
  81 writable images will consume TEX slots, VTX slots too because of linear indexing
  82
  83 */
  84
  85 struct r600_resource* r600_compute_buffer_alloc_vram(
  86        struct r600_screen *screen,
  87        unsigned size)
  88 {
  89         struct pipe_resource * buffer = NULL;
  90         assert(size);
  91
  92         buffer = pipe_buffer_create(
  93                 (struct pipe_screen*) screen,
  94                 PIPE_BIND_CUSTOM,
  95                 PIPE_USAGE_IMMUTABLE,
  96                 size);
  97
  98         return (struct r600_resource *)buffer;
  99 }
 100
 101
 102 static void evergreen_set_rat(
 103         struct r600_pipe_compute *pipe,
 104         int id,
 105         struct r600_resource* bo,
 106         int start,
 107         int size)
 108 {
 109         struct pipe_surface rat_templ;
 110         struct r600_surface *surf = NULL;
 111         struct r600_context *rctx = NULL;
 112
 113         assert(id < 12);
 114         assert((size & 3) == 0);
 115         assert((start & 0xFF) == 0);
 116
 117         rctx = pipe->ctx;
 118
 119         COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
 120
 121         /* Create the RAT surface */
 122         memset(&rat_templ, 0, sizeof(rat_templ));
 123         rat_templ.format = PIPE_FORMAT_R32_UINT;
 124         rat_templ.u.tex.level = 0;
 125         rat_templ.u.tex.first_layer = 0;
 126         rat_templ.u.tex.last_layer = 0;
 127
 128         /* Add the RAT the list of color buffers */
 129         pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
 130                 (struct pipe_context *)pipe->ctx,
 131                 (struct pipe_resource *)bo, &rat_templ);
 132
 133         /* Update the number of color buffers */
 134         pipe->ctx->framebuffer.state.nr_cbufs =
 135                 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
 136
 137         /* Update the cb_target_mask
 138          * XXX: I think this is a potential spot for bugs once we start doing
 139          * GL interop.  cb_target_mask may be modified in the 3D sections
 140          * of this driver. */
 141         pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
 142
 143         surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
 144         evergreen_init_color_surface_rat(rctx, surf);
 145 }
 146
 147 static void evergreen_cs_set_vertex_buffer(
 148         struct r600_context * rctx,
 149         unsigned vb_index,
 150         unsigned offset,
 151         struct pipe_resource * buffer)
 152 {
 153         struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
 154         struct pipe_vertex_buffer *vb = &state->vb[vb_index];
 155         vb->stride = 1;
 156         vb->buffer_offset = offset;
 157         vb->buffer = buffer;
 158         vb->user_buffer = NULL;
 159
 160         /* The vertex instructions in the compute shaders use the texture cache,
 161          * so we need to invalidate it. */
 162         rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 163         state->enabled_mask |= 1 << vb_index;
 164         state->dirty_mask |= 1 << vb_index;
 165         state->atom.dirty = true;
 166 }
 167
 168 static void evergreen_cs_set_constant_buffer(
 169         struct r600_context * rctx,
 170         unsigned cb_index,
 171         unsigned offset,
 172         unsigned size,
 173         struct pipe_resource * buffer)
 174 {
 175         struct pipe_constant_buffer cb;
 176         cb.buffer_size = size;
 177         cb.buffer_offset = offset;
 178         cb.buffer = buffer;
 179         cb.user_buffer = NULL;
 180
 181         rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
 182 }
 183
 184 static const struct u_resource_vtbl r600_global_buffer_vtbl =
 185 {
 186         u_default_resource_get_handle, /* get_handle */
 187         r600_compute_global_buffer_destroy, /* resource_destroy */
 188         r600_compute_global_transfer_map, /* transfer_map */
 189         r600_compute_global_transfer_flush_region,/* transfer_flush_region */
 190         r600_compute_global_transfer_unmap, /* transfer_unmap */
 191         r600_compute_global_transfer_inline_write /* transfer_inline_write */
 192 };
 193
 194
 195 void *evergreen_create_compute_state(
 196         struct pipe_context *ctx_,
 197         const const struct pipe_compute_state *cso)
 198 {
 199         struct r600_context *ctx = (struct r600_context *)ctx_;
 200         struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
 201
 202 #ifdef HAVE_OPENCL
 203         const struct pipe_llvm_program_header * header;
 204         const unsigned char * code;
 205         unsigned i;
 206
 207         COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
 208
 209         header = cso->prog;
 210         code = cso->prog + sizeof(struct pipe_llvm_program_header);
 211 #endif
 212
 213         shader->ctx = (struct r600_context*)ctx;
 214         shader->local_size = cso->req_local_mem;
 215         shader->private_size = cso->req_private_mem;
 216         shader->input_size = cso->req_input_mem;
 217
 218 #ifdef HAVE_OPENCL
 219         shader->num_kernels = radeon_llvm_get_num_kernels(code, header->num_bytes);
 220         shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
 221
 222         for (i = 0; i < shader->num_kernels; i++) {
 223                 struct r600_kernel *kernel = &shader->kernels[i];
 224                 kernel->llvm_module = radeon_llvm_get_kernel_module(i, code,
 225                                                         header->num_bytes);
 226         }
 227 #endif
 228         return shader;
 229 }
 230
 231 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
 232 {
 233         struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
 234
 235         free(shader);
 236 }
 237
 238 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
 239 {
 240         struct r600_context *ctx = (struct r600_context *)ctx_;
 241
 242         COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
 243
 244         ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
 245 }
 246
/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
 258 void evergreen_compute_upload_input(
 259         struct pipe_context *ctx_,
 260         const uint *block_layout,
 261         const uint *grid_layout,
 262         const void *input)
 263 {
 264         struct r600_context *ctx = (struct r600_context *)ctx_;
 265         struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
 266         int i;
 267         /* We need to reserve 9 dwords (36 bytes) for implicit kernel
 268          * parameters.
 269          */
 270         unsigned input_size = shader->input_size + 36;
 271         uint32_t * num_work_groups_start;
 272         uint32_t * global_size_start;
 273         uint32_t * local_size_start;
 274         uint32_t * kernel_parameters_start;
 275         struct pipe_box box;
 276         struct pipe_transfer *transfer = NULL;
 277
 278         if (shader->input_size == 0) {
 279                 return;
 280         }
 281
 282         if (!shader->kernel_param) {
 283                 /* Add space for the grid dimensions */
 284                 shader->kernel_param = (struct r600_resource *)
 285                         pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
 286                                         PIPE_USAGE_IMMUTABLE, input_size);
 287         }
 288
 289         u_box_1d(0, input_size, &box);
 290         num_work_groups_start = ctx_->transfer_map(ctx_,
 291                         (struct pipe_resource*)shader->kernel_param,
 292                         0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
 293                         &box, &transfer);
 294         global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
 295         local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
 296         kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
 297
 298         /* Copy the work group size */
 299         memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
 300
 301         /* Copy the global size */
 302         for (i = 0; i < 3; i++) {
 303                 global_size_start[i] = grid_layout[i] * block_layout[i];
 304         }
 305
 306         /* Copy the local dimensions */
 307         memcpy(local_size_start, block_layout, 3 * sizeof(uint));
 308
 309         /* Copy the kernel inputs */
 310         memcpy(kernel_parameters_start, input, shader->input_size);
 311
 312         for (i = 0; i < (input_size / 4); i++) {
 313                 COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
 314                         ((unsigned*)num_work_groups_start)[i]);
 315         }
 316
 317         ctx_->transfer_unmap(ctx_, transfer);
 318
 319         /* ID=0 is reserved for the parameters */
 320         evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
 321                         (struct pipe_resource*)shader->kernel_param);
 322 }
 323
 324 static void evergreen_emit_direct_dispatch(
 325                 struct r600_context *rctx,
 326                 const uint *block_layout, const uint *grid_layout)
 327 {
 328         int i;
 329         struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 330         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 331         unsigned num_waves;
 332         unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
 333         unsigned wave_divisor = (16 * num_pipes);
 334         int group_size = 1;
 335         int grid_size = 1;
 336         unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;
 337
 338         /* Calculate group_size/grid_size */
 339         for (i = 0; i < 3; i++) {
 340                 group_size *= block_layout[i];
 341         }
 342
 343         for (i = 0; i < 3; i++) {
 344                 grid_size *= grid_layout[i];
 345         }
 346
 347         /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
 348         num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
 349                         wave_divisor - 1) / wave_divisor;
 350
 351         COMPUTE_DBG(rctx->screen, "Using %u pipes, "
 352                                 "%u wavefronts per thread block, "
 353                                 "allocating %u dwords lds.\n",
 354                                 num_pipes, num_waves, lds_size);
 355
 356         r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
 357
 358         r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
 359         radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
 360         radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
 361         radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
 362
 363         r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
 364                                                                 group_size);
 365
 366         r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
 367         radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
 368         radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
 369         radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
 370
 371         if (rctx->b.chip_class < CAYMAN) {
 372                 assert(lds_size <= 8192);
 373         } else {
 374                 /* Cayman appears to have a slightly smaller limit, see the
 375                  * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
 376                 assert(lds_size <= 8160);
 377         }
 378
 379         r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
 380                                         lds_size | (num_waves << 14));
 381
 382         /* Dispatch packet */
 383         radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
 384         radeon_emit(cs, grid_layout[0]);
 385         radeon_emit(cs, grid_layout[1]);
 386         radeon_emit(cs, grid_layout[2]);
 387         /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
 388         radeon_emit(cs, 1);
 389 }
 390
 391 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 392                 const uint *grid_layout)
 393 {
 394         struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
 395         int i;
 396
 397         /* make sure that the gfx ring is only one active */
 398         if (ctx->b.rings.dma.cs) {
 399                 ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
 400         }
 401
 402         /* Initialize all the compute-related registers.
 403          *
 404          * See evergreen_init_atom_start_compute_cs() in this file for the list
 405          * of registers initialized by the start_compute_cs_cmd atom.
 406          */
 407         r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
 408
 409         ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
 410         r600_flush_emit(ctx);
 411
 412         /* Emit colorbuffers. */
 413         /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
 414         for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
 415                 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
 416                 unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
 417                                                        (struct r600_resource*)cb->base.texture,
 418                                                        RADEON_USAGE_READWRITE);
 419
 420                 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
 421                 radeon_emit(cs, cb->cb_color_base);     /* R_028C60_CB_COLOR0_BASE */
 422                 radeon_emit(cs, cb->cb_color_pitch);    /* R_028C64_CB_COLOR0_PITCH */
 423                 radeon_emit(cs, cb->cb_color_slice);    /* R_028C68_CB_COLOR0_SLICE */
 424                 radeon_emit(cs, cb->cb_color_view);     /* R_028C6C_CB_COLOR0_VIEW */
 425                 radeon_emit(cs, cb->cb_color_info);     /* R_028C70_CB_COLOR0_INFO */
 426                 radeon_emit(cs, cb->cb_color_attrib);   /* R_028C74_CB_COLOR0_ATTRIB */
 427                 radeon_emit(cs, cb->cb_color_dim);              /* R_028C78_CB_COLOR0_DIM */
 428
 429                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
 430                 radeon_emit(cs, reloc);
 431
 432                 if (!ctx->keep_tiling_flags) {
 433                         radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
 434                         radeon_emit(cs, reloc);
 435                 }
 436
 437                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
 438                 radeon_emit(cs, reloc);
 439         }
 440         if (ctx->keep_tiling_flags) {
 441                 for (; i < 8 ; i++) {
 442                         r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
 443                                                        S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 444                 }
 445                 for (; i < 12; i++) {
 446                         r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
 447                                                        S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 448                 }
 449         }
 450
 451         /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
 452         r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
 453                                         ctx->compute_cb_target_mask);
 454
 455
 456         /* Emit vertex buffer state */
 457         ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
 458         r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
 459
 460         /* Emit constant buffer state */
 461         r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
 462
 463         /* Emit compute shader state */
 464         r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
 465
 466         /* Emit dispatch state and dispatch packet */
 467         evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
 468
 469         /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
 470          */
 471         ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
 472                       R600_CONTEXT_INV_VERTEX_CACHE |
 473                       R600_CONTEXT_INV_TEX_CACHE;
 474         r600_flush_emit(ctx);
 475         ctx->b.flags = 0;
 476
 477         if (ctx->b.chip_class >= CAYMAN) {
 478                 ctx->skip_surface_sync_on_next_cs_flush = true;
 479         }
 480
 481 #if 0
 482         COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
 483         for (i = 0; i < cs->cdw; i++) {
 484                 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
 485         }
 486 #endif
 487
 488 }
 489
 490
 491 /**
 492  * Emit function for r600_cs_shader_state atom
 493  */
 494 void evergreen_emit_cs_shader(
 495                 struct r600_context *rctx,
 496                 struct r600_atom *atom)
 497 {
 498         struct r600_cs_shader_state *state =
 499                                         (struct r600_cs_shader_state*)atom;
 500         struct r600_pipe_compute *shader = state->shader;
 501         struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
 502         struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 503         uint64_t va;
 504
 505         va = r600_resource_va(&rctx->screen->b.b, &kernel->code_bo->b.b);
 506
 507         r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
 508         radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
 509         radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
 510                         S_0288D4_NUM_GPRS(kernel->bc.ngpr)
 511                         | S_0288D4_STACK_SIZE(kernel->bc.nstack));
 512         radeon_emit(cs, 0);     /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
 513
 514         radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
 515         radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
 516                                                         kernel->code_bo, RADEON_USAGE_READ));
 517 }
 518
 519 static void evergreen_launch_grid(
 520                 struct pipe_context *ctx_,
 521                 const uint *block_layout, const uint *grid_layout,
 522                 uint32_t pc, const void *input)
 523 {
 524         struct r600_context *ctx = (struct r600_context *)ctx_;
 525
 526         struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
 527         struct r600_kernel *kernel = &shader->kernels[pc];
 528
 529         COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
 530
 531 #ifdef HAVE_OPENCL
 532
 533         if (!kernel->code_bo) {
 534                 void *p;
 535                 struct r600_bytecode *bc = &kernel->bc;
 536                 LLVMModuleRef mod = kernel->llvm_module;
 537                 boolean use_kill = false;
 538                 bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
 539                 unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
 540                 unsigned sb_disasm = use_sb ||
 541                         (ctx->screen->b.debug_flags & DBG_SB_DISASM);
 542
 543                 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
 544                            ctx->screen->has_compressed_msaa_texturing);
 545                 bc->type = TGSI_PROCESSOR_COMPUTE;
 546                 bc->isa = ctx->isa;
 547                 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
 548
 549                 if (dump && !sb_disasm) {
 550                         r600_bytecode_disasm(bc);
 551                 } else if ((dump && sb_disasm) || use_sb) {
 552                         if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
 553                                 R600_ERR("r600_sb_bytecode_process failed!\n");
 554                 }
 555
 556                 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
 557                                                         kernel->bc.ndw * 4);
 558                 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
 559                 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
 560                 ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
 561         }
 562 #endif
 563         shader->active_kernel = kernel;
 564         ctx->cs_shader_state.kernel_index = pc;
 565         evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
 566         compute_emit_cs(ctx, block_layout, grid_layout);
 567 }
 568
 569 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
 570                 unsigned start, unsigned count,
 571                 struct pipe_surface ** surfaces)
 572 {
 573         struct r600_context *ctx = (struct r600_context *)ctx_;
 574         struct r600_surface **resources = (struct r600_surface **)surfaces;
 575
 576         COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
 577                         start, count);
 578
 579         for (int i = 0; i < count; i++) {
 580                 /* The First two vertex buffers are reserved for parameters and
 581                  * global buffers. */
 582                 unsigned vtx_id = 2 + i;
 583                 if (resources[i]) {
 584                         struct r600_resource_global *buffer =
 585                                 (struct r600_resource_global*)
 586                                 resources[i]->base.texture;
 587                         if (resources[i]->base.writable) {
 588                                 assert(i+1 < 12);
 589
 590                                 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
 591                                 (struct r600_resource *)resources[i]->base.texture,
 592                                 buffer->chunk->start_in_dw*4,
 593                                 resources[i]->base.texture->width0);
 594                         }
 595
 596                         evergreen_cs_set_vertex_buffer(ctx, vtx_id,
 597                                         buffer->chunk->start_in_dw * 4,
 598                                         resources[i]->base.texture);
 599                 }
 600         }
 601 }
 602
 603 void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
 604                 unsigned start_slot, unsigned count,
 605                 struct pipe_sampler_view **views)
 606 {
 607         struct r600_pipe_sampler_view **resource =
 608                 (struct r600_pipe_sampler_view **)views;
 609
 610         for (int i = 0; i < count; i++) {
 611                 if (resource[i]) {
 612                         assert(i+1 < 12);
 613                         /* XXX: Implement */
 614                         assert(!"Compute samplers not implemented.");
 615                         ///FETCH0 = VTX0 (param buffer),
 616                         //FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
 617                 }
 618         }
 619 }
 620
 621
 622 static void evergreen_set_global_binding(
 623         struct pipe_context *ctx_, unsigned first, unsigned n,
 624         struct pipe_resource **resources,
 625         uint32_t **handles)
 626 {
 627         struct r600_context *ctx = (struct r600_context *)ctx_;
 628         struct compute_memory_pool *pool = ctx->screen->global_pool;
 629         struct r600_resource_global **buffers =
 630                 (struct r600_resource_global **)resources;
 631
 632         COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
 633                         first, n);
 634
 635         if (!resources) {
 636                 /* XXX: Unset */
 637                 return;
 638         }
 639
 640         compute_memory_finalize_pending(pool, ctx_);
 641
 642         for (int i = 0; i < n; i++)
 643         {
 644                 assert(resources[i]->target == PIPE_BUFFER);
 645                 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
 646
 647                 *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
 648         }
 649
 650         evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
 651         evergreen_cs_set_vertex_buffer(ctx, 1, 0,
 652                                 (struct pipe_resource*)pool->bo);
 653 }
 654
/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
 666 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
 667 {
 668         struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
 669         int num_threads;
 670         int num_stack_entries;
 671
 672         /* since all required registers are initialised in the
 673          * start_compute_cs_cmd atom, we can EMIT_EARLY here.
 674          */
 675         r600_init_command_buffer(cb, 256);
 676         cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
 677
 678         /* This must be first. */
 679         r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
 680         r600_store_value(cb, 0x80000000);
 681         r600_store_value(cb, 0x80000000);
 682
 683         /* We're setting config registers here. */
 684         r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 685         r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 686
 687         switch (ctx->b.family) {
 688         case CHIP_CEDAR:
 689         default:
 690                 num_threads = 128;
 691                 num_stack_entries = 256;
 692                 break;
 693         case CHIP_REDWOOD:
 694                 num_threads = 128;
 695                 num_stack_entries = 256;
 696                 break;
 697         case CHIP_JUNIPER:
 698                 num_threads = 128;
 699                 num_stack_entries = 512;
 700                 break;
 701         case CHIP_CYPRESS:
 702         case CHIP_HEMLOCK:
 703                 num_threads = 128;
 704                 num_stack_entries = 512;
 705                 break;
 706         case CHIP_PALM:
 707                 num_threads = 128;
 708                 num_stack_entries = 256;
 709                 break;
 710         case CHIP_SUMO:
 711                 num_threads = 128;
 712                 num_stack_entries = 256;
 713                 break;
 714         case CHIP_SUMO2:
 715                 num_threads = 128;
 716                 num_stack_entries = 512;
 717                 break;
 718         case CHIP_BARTS:
 719                 num_threads = 128;
 720                 num_stack_entries = 512;
 721                 break;
 722         case CHIP_TURKS:
 723                 num_threads = 128;
 724                 num_stack_entries = 256;
 725                 break;
 726         case CHIP_CAICOS:
 727                 num_threads = 128;
 728                 num_stack_entries = 256;
 729                 break;
 730         }
 731
 732         /* Config Registers */
 733         if (ctx->b.chip_class < CAYMAN)
 734                 evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
 735                                            ctx->screen->b.info.drm_minor);
 736         else
 737                 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
 738                                         ctx->screen->b.info.drm_minor);
 739
 740         /* The primitive type always needs to be POINTLIST for compute. */
 741         r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
 742                                                 V_008958_DI_PT_POINTLIST);
 743
 744         if (ctx->b.chip_class < CAYMAN) {
 745
 746                 /* These registers control which simds can be used by each stage.
 747                  * The default for these registers is 0xffffffff, which means
 748                  * all simds are available for each stage.  It's possible we may
 749                  * want to play around with these in the future, but for now
 750                  * the default value is fine.
 751                  *
 752                  * R_008E20_SQ_STATIC_THREAD_MGMT1
 753                  * R_008E24_SQ_STATIC_THREAD_MGMT2
 754                  * R_008E28_SQ_STATIC_THREAD_MGMT3
 755                  */
 756
 757                 /* XXX: We may need to adjust the thread and stack resource
 758                  * values for 3D/compute interop */
 759
 760                 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
 761
 762                 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
 763                  * Set the number of threads used by the PS/VS/GS/ES stage to
 764                  * 0.
 765                  */
 766                 r600_store_value(cb, 0);
 767
 768                 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
 769                  * Set the number of threads used by the CS (aka LS) stage to
 770                  * the maximum number of threads and set the number of threads
 771                  * for the HS stage to 0. */
 772                 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
 773
 774                 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
 775                  * Set the Control Flow stack entries to 0 for PS/VS stages */
 776                 r600_store_value(cb, 0);
 777
 778                 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
 779                  * Set the Control Flow stack entries to 0 for GS/ES stages */
 780                 r600_store_value(cb, 0);
 781
 782                 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
 783                  * Set the Control Flow stack entries to 0 for the HS stage, and
 784                  * set it to the maximum value for the CS (aka LS) stage. */
 785                 r600_store_value(cb,
 786                         S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
 787         }
 788         /* Give the compute shader all the available LDS space.
 789          * NOTE: This only sets the maximum number of dwords that a compute
 790          * shader can allocate.  When a shader is executed, we still need to
 791          * allocate the appropriate amount of LDS dwords using the
 792          * CM_R_0288E8_SQ_LDS_ALLOC register.
 793          */
 794         if (ctx->b.chip_class < CAYMAN) {
 795                 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
 796                         S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
 797         } else {
 798                 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
 799                         S_0286FC_NUM_PS_LDS(0) |
 800                         S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
 801         }
 802
 803         /* Context Registers */
 804
 805         if (ctx->b.chip_class < CAYMAN) {
 806                 /* workaround for hw issues with dyn gpr - must set all limits
 807                  * to 240 instead of 0, 0x1e == 240 / 8
 808                  */
 809                 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
 810                                 S_028838_PS_GPRS(0x1e) |
 811                                 S_028838_VS_GPRS(0x1e) |
 812                                 S_028838_GS_GPRS(0x1e) |
 813                                 S_028838_ES_GPRS(0x1e) |
 814                                 S_028838_HS_GPRS(0x1e) |
 815                                 S_028838_LS_GPRS(0x1e));
 816         }
 817
 818         /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
 819         r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
 820                 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
 821
 822         r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
 823
 824         r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
 825                                                 S_0286E8_TID_IN_GROUP_ENA
 826                                                 | S_0286E8_TGID_ENA
 827                                                 | S_0286E8_DISABLE_INDEX_PACK)
 828                                                 ;
 829
 830         /* The LOOP_CONST registers are an optimization for loops that allows
 831          * you to store the initial counter, increment value, and maximum
 832          * counter value in a register so that hardware can calculate the
 833          * correct number of iterations for the loop, so that you don't need
 834          * to have the loop counter in your shader code.  We don't currently use
 835          * this optimization, so we must keep track of the counter in the
 836          * shader and use a break instruction to exit loops.  However, the
 837          * hardware will still use this register to determine when to exit a
 838          * loop, so we need to initialize the counter to 0, set the increment
 839          * value to 1 and the maximum counter value to 4095 (0xfff), which
 840          * is the maximum value allowed.  This gives us a maximum of 4096
 841          * iterations for our loops, but hopefully our break instruction will
 842          * execute some time before the 4096th iteration.
 843          */
 844         eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
 845 }
 846
 847 void evergreen_init_compute_state_functions(struct r600_context *ctx)
 848 {
 849         ctx->b.b.create_compute_state = evergreen_create_compute_state;
 850         ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
 851         ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
 852 //       ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
 853         ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
 854         ctx->b.b.set_global_binding = evergreen_set_global_binding;
 855         ctx->b.b.launch_grid = evergreen_launch_grid;
 856
 857         /* We always use at least one vertex buffer for parameters (id = 1)*/
 858         ctx->cs_vertex_buffer_state.enabled_mask =
 859         ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
 860 }
 861
 862 struct pipe_resource *r600_compute_global_buffer_create(
 863         struct pipe_screen *screen,
 864         const struct pipe_resource *templ)
 865 {
 866         struct r600_resource_global* result = NULL;
 867         struct r600_screen* rscreen = NULL;
 868         int size_in_dw = 0;
 869
 870         assert(templ->target == PIPE_BUFFER);
 871         assert(templ->bind & PIPE_BIND_GLOBAL);
 872         assert(templ->array_size == 1 || templ->array_size == 0);
 873         assert(templ->depth0 == 1 || templ->depth0 == 0);
 874         assert(templ->height0 == 1 || templ->height0 == 0);
 875
 876         result = (struct r600_resource_global*)
 877         CALLOC(sizeof(struct r600_resource_global), 1);
 878         rscreen = (struct r600_screen*)screen;
 879
 880         COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
 881         COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
 882                         templ->array_size);
 883
 884         result->base.b.vtbl = &r600_global_buffer_vtbl;
 885         result->base.b.b.screen = screen;
 886         result->base.b.b = *templ;
 887         pipe_reference_init(&result->base.b.b.reference, 1);
 888
 889         size_in_dw = (templ->width0+3) / 4;
 890
 891         result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
 892
 893         if (result->chunk == NULL)
 894         {
 895                 free(result);
 896                 return NULL;
 897         }
 898
 899         return &result->base.b.b;
 900 }
 901
 902 void r600_compute_global_buffer_destroy(
 903         struct pipe_screen *screen,
 904         struct pipe_resource *res)
 905 {
 906         struct r600_resource_global* buffer = NULL;
 907         struct r600_screen* rscreen = NULL;
 908
 909         assert(res->target == PIPE_BUFFER);
 910         assert(res->bind & PIPE_BIND_GLOBAL);
 911
 912         buffer = (struct r600_resource_global*)res;
 913         rscreen = (struct r600_screen*)screen;
 914
 915         compute_memory_free(rscreen->global_pool, buffer->chunk->id);
 916
 917         buffer->chunk = NULL;
 918         free(res);
 919 }
 920
 921 void *r600_compute_global_transfer_map(
 922         struct pipe_context *ctx_,
 923         struct pipe_resource *resource,
 924         unsigned level,
 925         unsigned usage,
 926         const struct pipe_box *box,
 927         struct pipe_transfer **ptransfer)
 928 {
 929         struct r600_context *rctx = (struct r600_context*)ctx_;
 930         struct compute_memory_pool *pool = rctx->screen->global_pool;
 931         struct r600_resource_global* buffer =
 932                 (struct r600_resource_global*)resource;
 933
 934         COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
 935                         "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
 936                         "width = %u, height = %u, depth = %u)\n", level, usage,
 937                         box->x, box->y, box->z, box->width, box->height,
 938                         box->depth);
 939         COMPUTE_DBG(rctx->screen, "Buffer id = %u offset = "
 940                 "%u (box.x)\n", buffer->chunk->id, box->x);
 941
 942
 943         compute_memory_finalize_pending(pool, ctx_);
 944
 945         assert(resource->target == PIPE_BUFFER);
 946         assert(resource->bind & PIPE_BIND_GLOBAL);
 947         assert(box->x >= 0);
 948         assert(box->y == 0);
 949         assert(box->z == 0);
 950
 951         ///TODO: do it better, mapping is not possible if the pool is too big
 952         return pipe_buffer_map_range(ctx_, (struct pipe_resource*)buffer->chunk->pool->bo,
 953                         box->x + (buffer->chunk->start_in_dw * 4),
 954                         box->width, usage, ptransfer);
 955 }
 956
 957 void r600_compute_global_transfer_unmap(
 958         struct pipe_context *ctx_,
 959         struct pipe_transfer* transfer)
 960 {
 961         /* struct r600_resource_global are not real resources, they just map
 962          * to an offset within the compute memory pool.  The function
 963          * r600_compute_global_transfer_map() maps the memory pool
 964          * resource rather than the struct r600_resource_global passed to
 965          * it as an argument and then initalizes ptransfer->resource with
 966          * the memory pool resource (via pipe_buffer_map_range).
 967          * When transfer_unmap is called it uses the memory pool's
 968          * vtable which calls r600_buffer_transfer_map() rather than
 969          * this function.
 970          */
 971         assert (!"This function should not be called");
 972 }
 973
 974 void r600_compute_global_transfer_flush_region(
 975         struct pipe_context *ctx_,
 976         struct pipe_transfer *transfer,
 977         const struct pipe_box *box)
 978 {
 979         assert(0 && "TODO");
 980 }
 981
 982 void r600_compute_global_transfer_inline_write(
 983         struct pipe_context *pipe,
 984         struct pipe_resource *resource,
 985         unsigned level,
 986         unsigned usage,
 987         const struct pipe_box *box,
 988         const void *data,
 989         unsigned stride,
 990         unsigned layer_stride)
 991 {
 992         assert(0 && "TODO");
 993 }