1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2 and RAT1 are paired, and so on up the list
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer, is used for constant
67 indexing, and is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled
73 we should reserve another one too, leaving at most 10 image bindings for writing.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images will consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
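/* A worked example of the scheme above, as implemented by
 * evergreen_set_global_binding() and evergreen_set_compute_resources()
 * below: the global memory pool is bound to RAT0 (writes) and VTX1
 * (reads), the kernel parameter buffer is bound at index 0 (CONST0, see
 * evergreen_compute_upload_input()), and compute resource i is bound to
 * VTX(2+i), plus RAT(i+1) when it is writable. */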
85
86 struct r600_resource* r600_compute_buffer_alloc_vram(
87 struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource * buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create(
94 (struct pipe_screen*) screen,
95 PIPE_BIND_CUSTOM,
96 PIPE_USAGE_IMMUTABLE,
97 size);
98
99 return (struct r600_resource *)buffer;
100 }
101
102
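/* Bind 'bo' as RAT 'id': create an R32_UINT surface over the buffer and
 * add it to the compute framebuffer state as color buffer 'id'. */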
103 static void evergreen_set_rat(
104 struct r600_pipe_compute *pipe,
105 unsigned id,
106 struct r600_resource* bo,
107 int start,
108 int size)
109 {
110 struct pipe_surface rat_templ;
111 struct r600_surface *surf = NULL;
112 struct r600_context *rctx = NULL;
113
114 assert(id < 12);
115 assert((size & 3) == 0);
116 assert((start & 0xFF) == 0);
117
118 rctx = pipe->ctx;
119
120 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
121
122 /* Create the RAT surface */
123 memset(&rat_templ, 0, sizeof(rat_templ));
124 rat_templ.format = PIPE_FORMAT_R32_UINT;
125 rat_templ.u.tex.level = 0;
126 rat_templ.u.tex.first_layer = 0;
127 rat_templ.u.tex.last_layer = 0;
128
129 /* Add the RAT to the list of color buffers */
130 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131 (struct pipe_context *)pipe->ctx,
132 (struct pipe_resource *)bo, &rat_templ);
133
134 /* Update the number of color buffers */
135 pipe->ctx->framebuffer.state.nr_cbufs =
136 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138 /* Update the cb_target_mask
139 * XXX: I think this is a potential spot for bugs once we start doing
140 * GL interop. cb_target_mask may be modified in the 3D sections
141 * of this driver. */
142 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145 evergreen_init_color_surface_rat(rctx, surf);
146 }
147
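/* Bind 'buffer' at byte 'offset' as compute vertex buffer 'vb_index' and
 * mark the CS vertex buffer atom dirty so the binding is re-emitted with
 * the next dispatch. */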
148 static void evergreen_cs_set_vertex_buffer(
149 struct r600_context * rctx,
150 unsigned vb_index,
151 unsigned offset,
152 struct pipe_resource * buffer)
153 {
154 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156 vb->stride = 1;
157 vb->buffer_offset = offset;
158 vb->buffer = buffer;
159 vb->user_buffer = NULL;
160
161 /* The vertex instructions in the compute shaders use the texture cache,
162 * so we need to invalidate it. */
163 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164 state->enabled_mask |= 1 << vb_index;
165 state->dirty_mask |= 1 << vb_index;
166 r600_mark_atom_dirty(rctx, &state->atom);
167 }
168
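/* Bind 'size' bytes of 'buffer', starting at byte 'offset', as compute
 * constant buffer 'cb_index' through the generic set_constant_buffer()
 * hook. */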
169 static void evergreen_cs_set_constant_buffer(
170 struct r600_context * rctx,
171 unsigned cb_index,
172 unsigned offset,
173 unsigned size,
174 struct pipe_resource * buffer)
175 {
176 struct pipe_constant_buffer cb;
177 cb.buffer_size = size;
178 cb.buffer_offset = offset;
179 cb.buffer = buffer;
180 cb.user_buffer = NULL;
181
182 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183 }
184
185 static const struct u_resource_vtbl r600_global_buffer_vtbl =
186 {
187 u_default_resource_get_handle, /* get_handle */
188 r600_compute_global_buffer_destroy, /* resource_destroy */
189 r600_compute_global_transfer_map, /* transfer_map */
190 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191 r600_compute_global_transfer_unmap, /* transfer_unmap */
192 r600_compute_global_transfer_inline_write /* transfer_inline_write */
193 };
194
195
196 void *evergreen_create_compute_state(
197 struct pipe_context *ctx_,
198 const struct pipe_compute_state *cso)
199 {
200 struct r600_context *ctx = (struct r600_context *)ctx_;
201 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const char *code;
205 void *p;
206 boolean use_kill;
207
208 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
211 radeon_shader_binary_init(&shader->binary);
212 radeon_elf_read(code, header->num_bytes, &shader->binary);
213 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
214
215 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
216 shader->bc.ndw * 4);
217 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
218 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
219 ctx->b.ws->buffer_unmap(shader->code_bo->buf);
220 #endif
221
222 shader->ctx = ctx;
223 shader->local_size = cso->req_local_mem;
224 shader->private_size = cso->req_private_mem;
225 shader->input_size = cso->req_input_mem;
226
227 return shader;
228 }
229
230 void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
231 {
232 struct r600_context *ctx = (struct r600_context *)ctx_;
233 COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
234 struct r600_pipe_compute *shader = state;
235
236 if (!shader)
237 return;
238
239 #ifdef HAVE_OPENCL
240 radeon_shader_binary_clean(&shader->binary);
241 r600_destroy_shader(&shader->bc);
242
243 /* TODO destroy shader->code_bo, shader->const_bo
244 * we'll need something like r600_buffer_free */
245 #endif
246 FREE(shader);
247 }
248
249 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
250 {
251 struct r600_context *ctx = (struct r600_context *)ctx_;
252
253 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
254
255 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
256 }
257
258 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
259 * kernel parameters there are implicit parameters that need to be stored
260 * in the vertex buffer as well. Here is how these parameters are organized in
261 * the buffer:
262 *
263 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
264 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
265 * DWORDS 6-8: Number of work items within each work group in each dimension
266 * (x,y,z)
267 * DWORDS 9+ : Kernel parameters
268 */
269 void evergreen_compute_upload_input(
270 struct pipe_context *ctx_,
271 const uint *block_layout,
272 const uint *grid_layout,
273 const void *input)
274 {
275 struct r600_context *ctx = (struct r600_context *)ctx_;
276 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
277 unsigned i;
278 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
279 * parameters.
280 */
281 unsigned input_size = shader->input_size + 36;
282 uint32_t * num_work_groups_start;
283 uint32_t * global_size_start;
284 uint32_t * local_size_start;
285 uint32_t * kernel_parameters_start;
286 struct pipe_box box;
287 struct pipe_transfer *transfer = NULL;
288
289 if (shader->input_size == 0) {
290 return;
291 }
292
293 if (!shader->kernel_param) {
294 /* Add space for the grid dimensions */
295 shader->kernel_param = (struct r600_resource *)
296 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
297 PIPE_USAGE_IMMUTABLE, input_size);
298 }
299
300 u_box_1d(0, input_size, &box);
301 num_work_groups_start = ctx_->transfer_map(ctx_,
302 (struct pipe_resource*)shader->kernel_param,
303 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
304 &box, &transfer);
305 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
306 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
307 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
308
309 /* Copy the number of work groups (grid dimensions) */
310 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
311
312 /* Copy the global size */
313 for (i = 0; i < 3; i++) {
314 global_size_start[i] = grid_layout[i] * block_layout[i];
315 }
316
317 /* Copy the local dimensions */
318 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
319
320 /* Copy the kernel inputs */
321 memcpy(kernel_parameters_start, input, shader->input_size);
322
323 for (i = 0; i < (input_size / 4); i++) {
324 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
325 ((unsigned*)num_work_groups_start)[i]);
326 }
327
328 ctx_->transfer_unmap(ctx_, transfer);
329
330 /* ID=0 is reserved for the parameters */
331 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
332 (struct pipe_resource*)shader->kernel_param);
333 }
334
335 static void evergreen_emit_direct_dispatch(
336 struct r600_context *rctx,
337 const uint *block_layout, const uint *grid_layout)
338 {
339 int i;
340 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
341 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
342 unsigned num_waves;
343 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
344 unsigned wave_divisor = (16 * num_pipes);
345 int group_size = 1;
346 int grid_size = 1;
347 unsigned lds_size = shader->local_size / 4 +
348 shader->bc.nlds_dw;
349
350
351 /* Calculate group_size/grid_size */
352 for (i = 0; i < 3; i++) {
353 group_size *= block_layout[i];
354 }
355
356 for (i = 0; i < 3; i++) {
357 grid_size *= grid_layout[i];
358 }
359
360 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
361 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
362 wave_divisor - 1) / wave_divisor;
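/* For example (illustrative numbers only): an 8x8x1 block on a chip with
 * two quad pipes gives wave_divisor = 32, so num_waves = ceil(64 / 32) = 2
 * wavefronts per thread group. */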
363
364 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
365 "%u wavefronts per thread block, "
366 "allocating %u dwords lds.\n",
367 num_pipes, num_waves, lds_size);
368
369 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
370
371 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
372 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
373 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
374 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
375
376 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
377 group_size);
378
379 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
380 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
381 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
382 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
383
384 if (rctx->b.chip_class < CAYMAN) {
385 assert(lds_size <= 8192);
386 } else {
387 /* Cayman appears to have a slightly smaller limit, see the
388 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
389 assert(lds_size <= 8160);
390 }
391
392 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
393 lds_size | (num_waves << 14));
394
395 /* Dispatch packet */
396 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
397 radeon_emit(cs, grid_layout[0]);
398 radeon_emit(cs, grid_layout[1]);
399 radeon_emit(cs, grid_layout[2]);
400 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
401 radeon_emit(cs, 1);
402 }
403
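/* Build the command stream for one compute dispatch: start-of-CS state,
 * the RAT color buffers, vertex/constant/sampler state, the CS shader
 * state and finally the dispatch packet emitted by
 * evergreen_emit_direct_dispatch() above. */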
404 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
405 const uint *grid_layout)
406 {
407 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
408 unsigned i;
409
410 /* make sure that the gfx ring is the only one active */
411 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
412 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
413 }
414
415 /* Initialize all the compute-related registers.
416 *
417 * See evergreen_init_atom_start_compute_cs() in this file for the list
418 * of registers initialized by the start_compute_cs_cmd atom.
419 */
420 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
421
422 /* emit config state */
423 if (ctx->b.chip_class == EVERGREEN)
424 r600_emit_atom(ctx, &ctx->config_state.atom);
425
426 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
427 r600_flush_emit(ctx);
428
429 /* Emit colorbuffers. */
430 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
431 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
432 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
433 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
434 (struct r600_resource*)cb->base.texture,
435 RADEON_USAGE_READWRITE,
436 RADEON_PRIO_SHADER_RW_BUFFER);
437
438 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
439 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
440 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
441 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
442 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
443 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
444 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
445 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
446
447 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
448 radeon_emit(cs, reloc);
449
450 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
451 radeon_emit(cs, reloc);
452 }
453 for (; i < 8 ; i++)
454 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
455 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
456 for (; i < 12; i++)
457 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
458 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
459
460 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
461 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
462 ctx->compute_cb_target_mask);
463
464
465 /* Emit vertex buffer state */
466 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
467 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
468
469 /* Emit constant buffer state */
470 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
471
472 /* Emit sampler state */
473 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
474
475 /* Emit sampler view (texture resource) state */
476 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
477
478 /* Emit compute shader state */
479 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
480
481 /* Emit dispatch state and dispatch packet */
482 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
483
484 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
485 */
486 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
487 R600_CONTEXT_INV_VERTEX_CACHE |
488 R600_CONTEXT_INV_TEX_CACHE;
489 r600_flush_emit(ctx);
490 ctx->b.flags = 0;
491
492 if (ctx->b.chip_class >= CAYMAN) {
493 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
494 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
495 /* DEALLOC_STATE prevents the GPU from hanging when a
496 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
497 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
498 */
499 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
500 cs->buf[cs->cdw++] = 0;
501 }
502
503 #if 0
504 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
505 for (i = 0; i < cs->cdw; i++) {
506 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
507 }
508 #endif
509
510 }
511
512
513 /**
514 * Emit function for r600_cs_shader_state atom
515 */
516 void evergreen_emit_cs_shader(
517 struct r600_context *rctx,
518 struct r600_atom *atom)
519 {
520 struct r600_cs_shader_state *state =
521 (struct r600_cs_shader_state*)atom;
522 struct r600_pipe_compute *shader = state->shader;
523 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
524 uint64_t va;
525 struct r600_resource *code_bo;
526 unsigned ngpr, nstack;
527
528 code_bo = shader->code_bo;
529 va = shader->code_bo->gpu_address + state->pc;
530 ngpr = shader->bc.ngpr;
531 nstack = shader->bc.nstack;
532
533 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
534 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
535 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
536 S_0288D4_NUM_GPRS(ngpr)
537 | S_0288D4_STACK_SIZE(nstack));
538 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
539
540 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
541 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
542 code_bo, RADEON_USAGE_READ,
543 RADEON_PRIO_USER_SHADER));
544 }
545
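/* pipe_context::launch_grid entry point: read the kernel's config from
 * the shader binary, upload the kernel arguments and emit the dispatch. */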
546 static void evergreen_launch_grid(
547 struct pipe_context *ctx_, const struct pipe_grid_info *info)
548 {
549 struct r600_context *ctx = (struct r600_context *)ctx_;
550 #ifdef HAVE_OPENCL
551 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
552 boolean use_kill;
553
554 ctx->cs_shader_state.pc = info->pc;
555 /* Get the config information for this kernel. */
556 r600_shader_binary_read_config(&shader->binary, &shader->bc,
557 info->pc, &use_kill);
558 #endif
559
560 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
561
562
563 evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input);
564 compute_emit_cs(ctx, info->block, info->grid);
565 }
566
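/* Bind the compute "resources" (pipe_surfaces over global buffers): each
 * surface i is bound to VTX(2+i), and writable surfaces additionally get
 * RAT(i+1). */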
567 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
568 unsigned start, unsigned count,
569 struct pipe_surface ** surfaces)
570 {
571 struct r600_context *ctx = (struct r600_context *)ctx_;
572 struct r600_surface **resources = (struct r600_surface **)surfaces;
573
574 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
575 start, count);
576
577 for (unsigned i = 0; i < count; i++) {
578 /* The first two vertex buffers are reserved for parameters and
579 * global buffers. */
580 unsigned vtx_id = 2 + i;
581 if (resources[i]) {
582 struct r600_resource_global *buffer =
583 (struct r600_resource_global*)
584 resources[i]->base.texture;
585 if (resources[i]->base.writable) {
586 assert(i+1 < 12);
587
588 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
589 (struct r600_resource *)resources[i]->base.texture,
590 buffer->chunk->start_in_dw*4,
591 resources[i]->base.texture->width0);
592 }
593
594 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
595 buffer->chunk->start_in_dw * 4,
596 resources[i]->base.texture);
597 }
598 }
599 }
600
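/* Bind global buffers: promote each buffer into the compute memory pool
 * if it is not already there, patch the application-visible handles with
 * the buffer's offset inside the pool, and bind the whole pool to
 * RAT0/VTX1. */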
601 static void evergreen_set_global_binding(
602 struct pipe_context *ctx_, unsigned first, unsigned n,
603 struct pipe_resource **resources,
604 uint32_t **handles)
605 {
606 struct r600_context *ctx = (struct r600_context *)ctx_;
607 struct compute_memory_pool *pool = ctx->screen->global_pool;
608 struct r600_resource_global **buffers =
609 (struct r600_resource_global **)resources;
610 unsigned i;
611
612 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
613 first, n);
614
615 if (!resources) {
616 /* XXX: Unset */
617 return;
618 }
619
620 /* We mark these items for promotion to the pool if they
621 * aren't already there */
622 for (i = first; i < first + n; i++) {
623 struct compute_memory_item *item = buffers[i]->chunk;
624
625 if (!is_item_in_pool(item))
626 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
627 }
628
629 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
630 /* XXX: Unset */
631 return;
632 }
633
634 for (i = first; i < first + n; i++)
635 {
636 uint32_t buffer_offset;
637 uint32_t handle;
638 assert(resources[i]->target == PIPE_BUFFER);
639 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
640
641 buffer_offset = util_le32_to_cpu(*(handles[i]));
642 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
643
644 *(handles[i]) = util_cpu_to_le32(handle);
645 }
646
647 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
648 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
649 (struct pipe_resource*)pool->bo);
650 }
651
652 /**
653 * This function initializes all the compute specific registers that need to
654 * be initialized for each compute command stream. Registers that are common
655 * to both compute and 3D will be initialized at the beginning of each compute
656 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
657 * packet requires that the shader type bit be set, we must initialize all
658 * context registers needed for compute in this function. The registers
659 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
660 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
661 * on the GPU family.
662 */
663 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
664 {
665 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
666 int num_threads;
667 int num_stack_entries;
668
669 /* since all required registers are initialized in the
670 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
671 */
672 r600_init_command_buffer(cb, 256);
673 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
674
675 /* This must be first. */
676 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
677 r600_store_value(cb, 0x80000000);
678 r600_store_value(cb, 0x80000000);
679
680 /* We're setting config registers here. */
681 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
682 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
683
684 switch (ctx->b.family) {
685 case CHIP_CEDAR:
686 default:
687 num_threads = 128;
688 num_stack_entries = 256;
689 break;
690 case CHIP_REDWOOD:
691 num_threads = 128;
692 num_stack_entries = 256;
693 break;
694 case CHIP_JUNIPER:
695 num_threads = 128;
696 num_stack_entries = 512;
697 break;
698 case CHIP_CYPRESS:
699 case CHIP_HEMLOCK:
700 num_threads = 128;
701 num_stack_entries = 512;
702 break;
703 case CHIP_PALM:
704 num_threads = 128;
705 num_stack_entries = 256;
706 break;
707 case CHIP_SUMO:
708 num_threads = 128;
709 num_stack_entries = 256;
710 break;
711 case CHIP_SUMO2:
712 num_threads = 128;
713 num_stack_entries = 512;
714 break;
715 case CHIP_BARTS:
716 num_threads = 128;
717 num_stack_entries = 512;
718 break;
719 case CHIP_TURKS:
720 num_threads = 128;
721 num_stack_entries = 256;
722 break;
723 case CHIP_CAICOS:
724 num_threads = 128;
725 num_stack_entries = 256;
726 break;
727 }
728
729 /* Config Registers */
730 if (ctx->b.chip_class < CAYMAN)
731 evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
732 ctx->screen->b.info.drm_minor);
733 else
734 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
735 ctx->screen->b.info.drm_minor);
736
737 /* The primitive type always needs to be POINTLIST for compute. */
738 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
739 V_008958_DI_PT_POINTLIST);
740
741 if (ctx->b.chip_class < CAYMAN) {
742
743 /* These registers control which simds can be used by each stage.
744 * The default for these registers is 0xffffffff, which means
745 * all simds are available for each stage. It's possible we may
746 * want to play around with these in the future, but for now
747 * the default value is fine.
748 *
749 * R_008E20_SQ_STATIC_THREAD_MGMT1
750 * R_008E24_SQ_STATIC_THREAD_MGMT2
751 * R_008E28_SQ_STATIC_THREAD_MGMT3
752 */
753
754 /* XXX: We may need to adjust the thread and stack resource
755 * values for 3D/compute interop */
756
757 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
758
759 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
760 * Set the number of threads used by the PS/VS/GS/ES stage to
761 * 0.
762 */
763 r600_store_value(cb, 0);
764
765 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
766 * Set the number of threads used by the CS (aka LS) stage to
767 * the maximum number of threads and set the number of threads
768 * for the HS stage to 0. */
769 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
770
771 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
772 * Set the Control Flow stack entries to 0 for PS/VS stages */
773 r600_store_value(cb, 0);
774
775 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
776 * Set the Control Flow stack entries to 0 for GS/ES stages */
777 r600_store_value(cb, 0);
778
779 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
780 * Set the Control Flow stack entries to 0 for the HS stage, and
781 * set it to the maximum value for the CS (aka LS) stage. */
782 r600_store_value(cb,
783 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
784 }
785 /* Give the compute shader all the available LDS space.
786 * NOTE: This only sets the maximum number of dwords that a compute
787 * shader can allocate. When a shader is executed, we still need to
788 * allocate the appropriate amount of LDS dwords using the
789 * CM_R_0288E8_SQ_LDS_ALLOC register.
790 */
791 if (ctx->b.chip_class < CAYMAN) {
792 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
793 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
794 } else {
795 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
796 S_0286FC_NUM_PS_LDS(0) |
797 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
798 }
799
800 /* Context Registers */
801
802 if (ctx->b.chip_class < CAYMAN) {
803 /* workaround for hw issues with dyn gpr - must set all limits
804 * to 240 instead of 0, 0x1e == 240 / 8
805 */
806 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
807 S_028838_PS_GPRS(0x1e) |
808 S_028838_VS_GPRS(0x1e) |
809 S_028838_GS_GPRS(0x1e) |
810 S_028838_ES_GPRS(0x1e) |
811 S_028838_HS_GPRS(0x1e) |
812 S_028838_LS_GPRS(0x1e));
813 }
814
815 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
816 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
817 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
818
819 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
820
821 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
822 S_0286E8_TID_IN_GROUP_ENA
823 | S_0286E8_TGID_ENA
824 | S_0286E8_DISABLE_INDEX_PACK)
825 ;
826
827 /* The LOOP_CONST registers are an optimization for loops that allows
828 * you to store the initial counter, increment value, and maximum
829 * counter value in a register so that hardware can calculate the
830 * correct number of iterations for the loop, so that you don't need
831 * to have the loop counter in your shader code. We don't currently use
832 * this optimization, so we must keep track of the counter in the
833 * shader and use a break instruction to exit loops. However, the
834 * hardware will still use this register to determine when to exit a
835 * loop, so we need to initialize the counter to 0, set the increment
836 * value to 1 and the maximum counter value to 4095 (0xfff), which
837 * is the maximum value allowed. This gives us a maximum of 4096
838 * iterations for our loops, but hopefully our break instruction will
839 * execute some time before the 4096th iteration.
840 */
841 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
842 }
843
844 void evergreen_init_compute_state_functions(struct r600_context *ctx)
845 {
846 ctx->b.b.create_compute_state = evergreen_create_compute_state;
847 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
848 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
849 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
850 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
851 ctx->b.b.set_global_binding = evergreen_set_global_binding;
852 ctx->b.b.launch_grid = evergreen_launch_grid;
853
854 }
855
856 struct pipe_resource *r600_compute_global_buffer_create(
857 struct pipe_screen *screen,
858 const struct pipe_resource *templ)
859 {
860 struct r600_resource_global* result = NULL;
861 struct r600_screen* rscreen = NULL;
862 int size_in_dw = 0;
863
864 assert(templ->target == PIPE_BUFFER);
865 assert(templ->bind & PIPE_BIND_GLOBAL);
866 assert(templ->array_size == 1 || templ->array_size == 0);
867 assert(templ->depth0 == 1 || templ->depth0 == 0);
868 assert(templ->height0 == 1 || templ->height0 == 0);
869
870 result = (struct r600_resource_global*)
871 CALLOC(sizeof(struct r600_resource_global), 1);
872 rscreen = (struct r600_screen*)screen;
873
874 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
875 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
876 templ->array_size);
877
878 result->base.b.vtbl = &r600_global_buffer_vtbl;
879 result->base.b.b = *templ;
880 result->base.b.b.screen = screen;
881 pipe_reference_init(&result->base.b.b.reference, 1);
882
883 size_in_dw = (templ->width0+3) / 4;
884
885 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
886
887 if (result->chunk == NULL)
888 {
889 free(result);
890 return NULL;
891 }
892
893 return &result->base.b.b;
894 }
895
896 void r600_compute_global_buffer_destroy(
897 struct pipe_screen *screen,
898 struct pipe_resource *res)
899 {
900 struct r600_resource_global* buffer = NULL;
901 struct r600_screen* rscreen = NULL;
902
903 assert(res->target == PIPE_BUFFER);
904 assert(res->bind & PIPE_BIND_GLOBAL);
905
906 buffer = (struct r600_resource_global*)res;
907 rscreen = (struct r600_screen*)screen;
908
909 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
910
911 buffer->chunk = NULL;
912 free(res);
913 }
914
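/* Map a global buffer: items living in the shared compute memory pool are
 * first demoted into their own buffer (item->real_buffer), and that
 * buffer is what actually gets mapped. */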
915 void *r600_compute_global_transfer_map(
916 struct pipe_context *ctx_,
917 struct pipe_resource *resource,
918 unsigned level,
919 unsigned usage,
920 const struct pipe_box *box,
921 struct pipe_transfer **ptransfer)
922 {
923 struct r600_context *rctx = (struct r600_context*)ctx_;
924 struct compute_memory_pool *pool = rctx->screen->global_pool;
925 struct r600_resource_global* buffer =
926 (struct r600_resource_global*)resource;
927
928 struct compute_memory_item *item = buffer->chunk;
929 struct pipe_resource *dst = NULL;
930 unsigned offset = box->x;
931
932 if (is_item_in_pool(item)) {
933 compute_memory_demote_item(pool, item, ctx_);
934 }
935 else {
936 if (item->real_buffer == NULL) {
937 item->real_buffer =
938 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
939 }
940 }
941
942 dst = (struct pipe_resource*)item->real_buffer;
943
944 if (usage & PIPE_TRANSFER_READ)
945 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
946
947 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
948 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
949 "width = %u, height = %u, depth = %u)\n", level, usage,
950 box->x, box->y, box->z, box->width, box->height,
951 box->depth);
952 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
953 "%u (box.x)\n", item->id, box->x);
954
955
956 assert(resource->target == PIPE_BUFFER);
957 assert(resource->bind & PIPE_BIND_GLOBAL);
958 assert(box->x >= 0);
959 assert(box->y == 0);
960 assert(box->z == 0);
961
962 /* TODO: do it better, mapping is not possible if the pool is too big */
963 return pipe_buffer_map_range(ctx_, dst,
964 offset, box->width, usage, ptransfer);
965 }
966
967 void r600_compute_global_transfer_unmap(
968 struct pipe_context *ctx_,
969 struct pipe_transfer* transfer)
970 {
971 /* struct r600_resource_global are not real resources, they just map
972 * to an offset within the compute memory pool. The function
973 * r600_compute_global_transfer_map() maps the memory pool
974 * resource rather than the struct r600_resource_global passed to
975 * it as an argument and then initializes ptransfer->resource with
976 * the memory pool resource (via pipe_buffer_map_range).
977 * When transfer_unmap is called it uses the memory pool's
978 * vtable which calls r600_buffer_transfer_unmap() rather than
979 * this function.
980 */
981 assert (!"This function should not be called");
982 }
983
984 void r600_compute_global_transfer_flush_region(
985 struct pipe_context *ctx_,
986 struct pipe_transfer *transfer,
987 const struct pipe_box *box)
988 {
989 assert(0 && "TODO");
990 }
991
992 void r600_compute_global_transfer_inline_write(
993 struct pipe_context *pipe,
994 struct pipe_resource *resource,
995 unsigned level,
996 unsigned usage,
997 const struct pipe_box *box,
998 const void *data,
999 unsigned stride,
1000 unsigned layer_stride)
1001 {
1002 assert(0 && "TODO");
1003 }