1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
 56 RAT0 is for global binding writes
 57 VTX1 is for global binding reads
 58
 59 for writing images: RAT1...
 60 for reading images: TEX2...
 61 TEX2-RAT1 are paired
 62
 63 TEX2... consumes the same fetch resources that VTX2... would consume
 64
 65 CONST0 and VTX0 are for parameters
 66 CONST0 binds the smaller input parameter buffer and is used for constant
 67 indexing; it is also cached in the constant cache
 68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
 69 the constant cache can handle
 70
 71 RATs are limited to 12, so we can bind at most 11 textures for writing
 72 because we reserve RAT0 for global bindings. With byte addressing enabled,
 73 we should reserve another one too => at most 10 image bindings for writing.
 74
 75 From NVIDIA OpenCL:
 76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
 77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
 78
 79 so 10 for writing is enough; 176 is the max for reading according to the docs
 80
 81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
 82 writable images also consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
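/* A minimal sketch of the slot mapping implied by the comment above. These
 * helpers are illustrative only (guarded out, never compiled into the driver);
 * the exact pairing is an assumption drawn from the "TEX2-RAT1 are paired"
 * note.
 */
#if 0
static unsigned writable_image_rat(unsigned image_id)
{
	/* RAT0 is reserved for the global buffer, so image i writes RAT(i + 1). */
	return image_id + 1;
}

static unsigned readable_image_tex(unsigned image_id)
{
	/* TEX2 pairs with RAT1, so image i is read through TEX(i + 2). */
	return image_id + 2;
}
#endif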
85
86 struct r600_resource* r600_compute_buffer_alloc_vram(
87 struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource * buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create(
94 (struct pipe_screen*) screen,
95 PIPE_BIND_CUSTOM,
96 PIPE_USAGE_IMMUTABLE,
97 size);
98
99 return (struct r600_resource *)buffer;
100 }
101
102
103 static void evergreen_set_rat(
104 struct r600_pipe_compute *pipe,
105 unsigned id,
106 struct r600_resource* bo,
107 int start,
108 int size)
109 {
110 struct pipe_surface rat_templ;
111 struct r600_surface *surf = NULL;
112 struct r600_context *rctx = NULL;
113
114 assert(id < 12);
115 assert((size & 3) == 0);
116 assert((start & 0xFF) == 0);
117
118 rctx = pipe->ctx;
119
 120 	COMPUTE_DBG(rctx->screen, "bind rat: %u\n", id);
121
122 /* Create the RAT surface */
123 memset(&rat_templ, 0, sizeof(rat_templ));
124 rat_templ.format = PIPE_FORMAT_R32_UINT;
125 rat_templ.u.tex.level = 0;
126 rat_templ.u.tex.first_layer = 0;
127 rat_templ.u.tex.last_layer = 0;
128
 129 	/* Add the RAT to the list of color buffers */
130 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131 (struct pipe_context *)pipe->ctx,
132 (struct pipe_resource *)bo, &rat_templ);
133
134 /* Update the number of color buffers */
135 pipe->ctx->framebuffer.state.nr_cbufs =
136 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138 /* Update the cb_target_mask
139 * XXX: I think this is a potential spot for bugs once we start doing
140 * GL interop. cb_target_mask may be modified in the 3D sections
141 * of this driver. */
142 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145 evergreen_init_color_surface_rat(rctx, surf);
146 }
147
148 static void evergreen_cs_set_vertex_buffer(
149 struct r600_context * rctx,
150 unsigned vb_index,
151 unsigned offset,
152 struct pipe_resource * buffer)
153 {
154 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156 vb->stride = 1;
157 vb->buffer_offset = offset;
158 vb->buffer = buffer;
159 vb->user_buffer = NULL;
160
161 /* The vertex instructions in the compute shaders use the texture cache,
162 * so we need to invalidate it. */
163 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164 state->enabled_mask |= 1 << vb_index;
165 state->dirty_mask |= 1 << vb_index;
166 r600_mark_atom_dirty(rctx, &state->atom);
167 }
168
169 static void evergreen_cs_set_constant_buffer(
170 struct r600_context * rctx,
171 unsigned cb_index,
172 unsigned offset,
173 unsigned size,
174 struct pipe_resource * buffer)
175 {
176 struct pipe_constant_buffer cb;
177 cb.buffer_size = size;
178 cb.buffer_offset = offset;
179 cb.buffer = buffer;
180 cb.user_buffer = NULL;
181
182 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183 }
184
185 static const struct u_resource_vtbl r600_global_buffer_vtbl =
186 {
187 u_default_resource_get_handle, /* get_handle */
188 r600_compute_global_buffer_destroy, /* resource_destroy */
189 r600_compute_global_transfer_map, /* transfer_map */
190 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191 r600_compute_global_transfer_unmap, /* transfer_unmap */
192 r600_compute_global_transfer_inline_write /* transfer_inline_write */
193 };
194
195
196 void *evergreen_create_compute_state(
197 struct pipe_context *ctx_,
 198 	const struct pipe_compute_state *cso)
199 {
200 struct r600_context *ctx = (struct r600_context *)ctx_;
201 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const char *code;
205 void *p;
206 boolean use_kill;
207
208 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
211 radeon_shader_binary_init(&shader->binary);
212 radeon_elf_read(code, header->num_bytes, &shader->binary);
213 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
214
215 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
216 shader->bc.ndw * 4);
217 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
218 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
219 ctx->b.ws->buffer_unmap(shader->code_bo->buf);
220 #endif
221
222 shader->ctx = ctx;
223 shader->local_size = cso->req_local_mem;
224 shader->private_size = cso->req_private_mem;
225 shader->input_size = cso->req_input_mem;
226
227 return shader;
228 }
229
230 void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
231 {
232 struct r600_context *ctx = (struct r600_context *)ctx_;
 233 	struct r600_pipe_compute *shader = state;
 234 	COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
235
236 if (!shader)
237 return;
238
239 #ifdef HAVE_OPENCL
240 radeon_shader_binary_clean(&shader->binary);
241 r600_destroy_shader(&shader->bc);
242
243 /* TODO destroy shader->code_bo, shader->const_bo
244 * we'll need something like r600_buffer_free */
245 #endif
246 FREE(shader);
247 }
248
249 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
250 {
251 struct r600_context *ctx = (struct r600_context *)ctx_;
252
253 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
254
255 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
256 }
257
 258 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
259 * kernel parameters there are implicit parameters that need to be stored
260 * in the vertex buffer as well. Here is how these parameters are organized in
261 * the buffer:
262 *
263 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
264 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
265 * DWORDS 6-8: Number of work items within each work group in each dimension
266 * (x,y,z)
267 * DWORDS 9+ : Kernel parameters
268 */
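/* Purely illustrative (guarded out, never compiled): assuming 32-bit uints,
 * the implicit part of the input buffer described above can be viewed as the
 * struct below; the struct name is made up for this sketch. The kernel
 * parameters start at DWORD 9, i.e. right after this header.
 */
#if 0
struct evergreen_compute_implicit_input {
	uint32_t num_work_groups[3]; /* DWORDS 0-2 */
	uint32_t global_size[3];     /* DWORDS 3-5 */
	uint32_t local_size[3];      /* DWORDS 6-8 */
};
#endif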
269 void evergreen_compute_upload_input(
270 struct pipe_context *ctx_,
271 const uint *block_layout,
272 const uint *grid_layout,
273 const void *input)
274 {
275 struct r600_context *ctx = (struct r600_context *)ctx_;
276 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
277 unsigned i;
278 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
279 * parameters.
280 */
281 unsigned input_size = shader->input_size + 36;
282 uint32_t * num_work_groups_start;
283 uint32_t * global_size_start;
284 uint32_t * local_size_start;
285 uint32_t * kernel_parameters_start;
286 struct pipe_box box;
287 struct pipe_transfer *transfer = NULL;
288
289 if (shader->input_size == 0) {
290 return;
291 }
292
293 if (!shader->kernel_param) {
294 /* Add space for the grid dimensions */
295 shader->kernel_param = (struct r600_resource *)
296 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
297 PIPE_USAGE_IMMUTABLE, input_size);
298 }
299
300 u_box_1d(0, input_size, &box);
301 num_work_groups_start = ctx_->transfer_map(ctx_,
302 (struct pipe_resource*)shader->kernel_param,
303 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
304 &box, &transfer);
 305 	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
 306 	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
 307 	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
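	/* The pointers above advance in units of dwords: each implicit (x, y, z)
	 * triple occupies 3 * sizeof(uint) / 4 = 3 dwords, matching the layout
	 * described in the comment before this function. */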
308
 309 	/* Copy the number of work groups (grid dimensions) */
310 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
311
312 /* Copy the global size */
313 for (i = 0; i < 3; i++) {
314 global_size_start[i] = grid_layout[i] * block_layout[i];
315 }
316
317 /* Copy the local dimensions */
318 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
319
320 /* Copy the kernel inputs */
321 memcpy(kernel_parameters_start, input, shader->input_size);
322
323 for (i = 0; i < (input_size / 4); i++) {
324 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
325 ((unsigned*)num_work_groups_start)[i]);
326 }
327
328 ctx_->transfer_unmap(ctx_, transfer);
329
330 /* ID=0 is reserved for the parameters */
331 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
332 (struct pipe_resource*)shader->kernel_param);
333 }
334
335 static void evergreen_emit_direct_dispatch(
336 struct r600_context *rctx,
337 const uint *block_layout, const uint *grid_layout)
338 {
339 int i;
340 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
341 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
342 unsigned num_waves;
343 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
344 unsigned wave_divisor = (16 * num_pipes);
345 int group_size = 1;
346 int grid_size = 1;
347 unsigned lds_size = shader->local_size / 4 +
348 shader->bc.nlds_dw;
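	/* local_size is in bytes, hence the division by 4; bc.nlds_dw is
	 * already a dword count. */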
349
350
351 /* Calculate group_size/grid_size */
352 for (i = 0; i < 3; i++) {
353 group_size *= block_layout[i];
354 }
355
356 for (i = 0; i < 3; i++) {
357 grid_size *= grid_layout[i];
358 }
359
 360 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
361 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
362 wave_divisor - 1) / wave_divisor;
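	/* Example: a 16x16x1 block on a hypothetical 2-pipe part gives
	 * ceil(256 / (16 * 2)) = 8 wavefronts per thread group. */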
363
364 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
365 "%u wavefronts per thread block, "
366 "allocating %u dwords lds.\n",
367 num_pipes, num_waves, lds_size);
368
369 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
370
371 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
372 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
373 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
374 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
375
376 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
377 group_size);
378
379 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
380 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
381 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
382 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
383
384 if (rctx->b.chip_class < CAYMAN) {
385 assert(lds_size <= 8192);
386 } else {
387 /* Cayman appears to have a slightly smaller limit, see the
388 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
389 assert(lds_size <= 8160);
390 }
391
392 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
393 lds_size | (num_waves << 14));
394
395 /* Dispatch packet */
396 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
397 radeon_emit(cs, grid_layout[0]);
398 radeon_emit(cs, grid_layout[1]);
399 radeon_emit(cs, grid_layout[2]);
400 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
401 radeon_emit(cs, 1);
402 }
403
404 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
405 const uint *grid_layout)
406 {
407 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
408 unsigned i;
409
 410 	/* make sure that the gfx ring is the only one active */
411 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
412 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
413 }
414
415 /* Initialize all the compute-related registers.
416 *
417 * See evergreen_init_atom_start_compute_cs() in this file for the list
418 * of registers initialized by the start_compute_cs_cmd atom.
419 */
420 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
421
422 /* emit config state */
423 if (ctx->b.chip_class == EVERGREEN)
424 r600_emit_atom(ctx, &ctx->config_state.atom);
425
426 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
427 r600_flush_emit(ctx);
428
429 /* Emit colorbuffers. */
430 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
431 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
432 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
433 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
434 (struct r600_resource*)cb->base.texture,
435 RADEON_USAGE_READWRITE,
436 RADEON_PRIO_SHADER_RW_BUFFER);
437
438 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
439 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
440 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
441 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
442 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
443 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
444 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
445 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
446
447 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
448 radeon_emit(cs, reloc);
449
450 if (!ctx->keep_tiling_flags) {
451 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
452 radeon_emit(cs, reloc);
453 }
454
455 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
456 radeon_emit(cs, reloc);
457 }
458 if (ctx->keep_tiling_flags) {
459 for (; i < 8 ; i++) {
460 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
461 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
462 }
463 for (; i < 12; i++) {
464 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
465 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
466 }
467 }
468
469 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
470 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
471 ctx->compute_cb_target_mask);
472
473
474 /* Emit vertex buffer state */
475 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
476 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
477
478 /* Emit constant buffer state */
479 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
480
481 /* Emit sampler state */
482 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
483
484 /* Emit sampler view (texture resource) state */
485 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
486
487 /* Emit compute shader state */
488 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
489
490 /* Emit dispatch state and dispatch packet */
491 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
492
493 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
494 */
495 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
496 R600_CONTEXT_INV_VERTEX_CACHE |
497 R600_CONTEXT_INV_TEX_CACHE;
498 r600_flush_emit(ctx);
499 ctx->b.flags = 0;
500
501 if (ctx->b.chip_class >= CAYMAN) {
502 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
503 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
504 /* DEALLOC_STATE prevents the GPU from hanging when a
505 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
506 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
507 */
508 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
509 cs->buf[cs->cdw++] = 0;
510 }
511
512 #if 0
513 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
514 for (i = 0; i < cs->cdw; i++) {
515 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
516 }
517 #endif
518
519 }
520
521
522 /**
523 * Emit function for r600_cs_shader_state atom
524 */
525 void evergreen_emit_cs_shader(
526 struct r600_context *rctx,
527 struct r600_atom *atom)
528 {
529 struct r600_cs_shader_state *state =
530 (struct r600_cs_shader_state*)atom;
531 struct r600_pipe_compute *shader = state->shader;
532 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
533 uint64_t va;
534 struct r600_resource *code_bo;
535 unsigned ngpr, nstack;
536
537 code_bo = shader->code_bo;
538 va = shader->code_bo->gpu_address + state->pc;
539 ngpr = shader->bc.ngpr;
540 nstack = shader->bc.nstack;
541
542 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
543 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
544 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
545 S_0288D4_NUM_GPRS(ngpr)
546 | S_0288D4_STACK_SIZE(nstack));
547 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
548
549 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
550 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
551 code_bo, RADEON_USAGE_READ,
552 RADEON_PRIO_USER_SHADER));
553 }
554
555 static void evergreen_launch_grid(
556 struct pipe_context *ctx_, const struct pipe_grid_info *info)
557 {
558 struct r600_context *ctx = (struct r600_context *)ctx_;
559 #ifdef HAVE_OPENCL
560 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
561 boolean use_kill;
562
563 ctx->cs_shader_state.pc = info->pc;
564 /* Get the config information for this kernel. */
565 r600_shader_binary_read_config(&shader->binary, &shader->bc,
566 info->pc, &use_kill);
567 #endif
568
569 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
570
571
572 evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input);
573 compute_emit_cs(ctx, info->block, info->grid);
574 }
575
576 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
577 unsigned start, unsigned count,
578 struct pipe_surface ** surfaces)
579 {
580 struct r600_context *ctx = (struct r600_context *)ctx_;
581 struct r600_surface **resources = (struct r600_surface **)surfaces;
582
583 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
584 start, count);
585
586 for (unsigned i = 0; i < count; i++) {
 587 		/* The first two vertex buffers are reserved for parameters and
588 * global buffers. */
589 unsigned vtx_id = 2 + i;
590 if (resources[i]) {
591 struct r600_resource_global *buffer =
592 (struct r600_resource_global*)
593 resources[i]->base.texture;
594 if (resources[i]->base.writable) {
595 assert(i+1 < 12);
596
597 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
598 (struct r600_resource *)resources[i]->base.texture,
599 buffer->chunk->start_in_dw*4,
600 resources[i]->base.texture->width0);
601 }
602
603 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
604 buffer->chunk->start_in_dw * 4,
605 resources[i]->base.texture);
606 }
607 }
608 }
609
610 static void evergreen_set_global_binding(
611 struct pipe_context *ctx_, unsigned first, unsigned n,
612 struct pipe_resource **resources,
613 uint32_t **handles)
614 {
615 struct r600_context *ctx = (struct r600_context *)ctx_;
616 struct compute_memory_pool *pool = ctx->screen->global_pool;
617 struct r600_resource_global **buffers =
618 (struct r600_resource_global **)resources;
619 unsigned i;
620
621 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
622 first, n);
623
624 if (!resources) {
625 /* XXX: Unset */
626 return;
627 }
628
629 /* We mark these items for promotion to the pool if they
630 * aren't already there */
631 for (i = first; i < first + n; i++) {
632 struct compute_memory_item *item = buffers[i]->chunk;
633
634 if (!is_item_in_pool(item))
635 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
636 }
637
638 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
639 /* XXX: Unset */
640 return;
641 }
642
643 for (i = first; i < first + n; i++)
644 {
645 uint32_t buffer_offset;
646 uint32_t handle;
647 assert(resources[i]->target == PIPE_BUFFER);
648 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
649
650 buffer_offset = util_le32_to_cpu(*(handles[i]));
651 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
652
653 *(handles[i]) = util_cpu_to_le32(handle);
654 }
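	/* Each handle has now been rewritten from an offset relative to its own
	 * buffer into a byte offset within the global memory pool, which is the
	 * address space the kernel sees through RAT0/VTX1 (see the comment at
	 * the top of this file). */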
655
656 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
657 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
658 (struct pipe_resource*)pool->bo);
659 }
660
661 /**
662 * This function initializes all the compute specific registers that need to
663 * be initialized for each compute command stream. Registers that are common
664 * to both compute and 3D will be initialized at the beginning of each compute
665 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
666 * packet requires that the shader type bit be set, we must initialize all
667 * context registers needed for compute in this function. The registers
668 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
669 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
670 * on the GPU family.
671 */
672 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
673 {
674 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
675 int num_threads;
676 int num_stack_entries;
677
678 /* since all required registers are initialized in the
679 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
680 */
681 r600_init_command_buffer(cb, 256);
682 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
683
684 /* This must be first. */
685 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
686 r600_store_value(cb, 0x80000000);
687 r600_store_value(cb, 0x80000000);
688
689 /* We're setting config registers here. */
690 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
691 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
692
693 switch (ctx->b.family) {
694 case CHIP_CEDAR:
695 default:
696 num_threads = 128;
697 num_stack_entries = 256;
698 break;
699 case CHIP_REDWOOD:
700 num_threads = 128;
701 num_stack_entries = 256;
702 break;
703 case CHIP_JUNIPER:
704 num_threads = 128;
705 num_stack_entries = 512;
706 break;
707 case CHIP_CYPRESS:
708 case CHIP_HEMLOCK:
709 num_threads = 128;
710 num_stack_entries = 512;
711 break;
712 case CHIP_PALM:
713 num_threads = 128;
714 num_stack_entries = 256;
715 break;
716 case CHIP_SUMO:
717 num_threads = 128;
718 num_stack_entries = 256;
719 break;
720 case CHIP_SUMO2:
721 num_threads = 128;
722 num_stack_entries = 512;
723 break;
724 case CHIP_BARTS:
725 num_threads = 128;
726 num_stack_entries = 512;
727 break;
728 case CHIP_TURKS:
729 num_threads = 128;
730 num_stack_entries = 256;
731 break;
732 case CHIP_CAICOS:
733 num_threads = 128;
734 num_stack_entries = 256;
735 break;
736 }
737
738 /* Config Registers */
739 if (ctx->b.chip_class < CAYMAN)
740 evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
741 ctx->screen->b.info.drm_minor);
742 else
743 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
744 ctx->screen->b.info.drm_minor);
745
746 /* The primitive type always needs to be POINTLIST for compute. */
747 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
748 V_008958_DI_PT_POINTLIST);
749
750 if (ctx->b.chip_class < CAYMAN) {
751
752 /* These registers control which simds can be used by each stage.
753 * The default for these registers is 0xffffffff, which means
754 * all simds are available for each stage. It's possible we may
755 * want to play around with these in the future, but for now
756 * the default value is fine.
757 *
758 * R_008E20_SQ_STATIC_THREAD_MGMT1
759 * R_008E24_SQ_STATIC_THREAD_MGMT2
760 * R_008E28_SQ_STATIC_THREAD_MGMT3
761 */
762
763 /* XXX: We may need to adjust the thread and stack resource
764 * values for 3D/compute interop */
765
766 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
767
768 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
769 * Set the number of threads used by the PS/VS/GS/ES stage to
770 * 0.
771 */
772 r600_store_value(cb, 0);
773
774 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
775 * Set the number of threads used by the CS (aka LS) stage to
776 * the maximum number of threads and set the number of threads
777 * for the HS stage to 0. */
778 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
779
780 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
781 * Set the Control Flow stack entries to 0 for PS/VS stages */
782 r600_store_value(cb, 0);
783
784 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
785 * Set the Control Flow stack entries to 0 for GS/ES stages */
786 r600_store_value(cb, 0);
787
788 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
 789 		 * Set the Control Flow stack entries to 0 for the HS stage, and
790 * set it to the maximum value for the CS (aka LS) stage. */
791 r600_store_value(cb,
792 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
793 }
794 /* Give the compute shader all the available LDS space.
795 * NOTE: This only sets the maximum number of dwords that a compute
796 * shader can allocate. When a shader is executed, we still need to
797 * allocate the appropriate amount of LDS dwords using the
798 * CM_R_0288E8_SQ_LDS_ALLOC register.
799 */
800 if (ctx->b.chip_class < CAYMAN) {
801 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
802 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
803 } else {
804 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
805 S_0286FC_NUM_PS_LDS(0) |
806 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
807 }
808
809 /* Context Registers */
810
811 if (ctx->b.chip_class < CAYMAN) {
812 /* workaround for hw issues with dyn gpr - must set all limits
813 * to 240 instead of 0, 0x1e == 240 / 8
814 */
815 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
816 S_028838_PS_GPRS(0x1e) |
817 S_028838_VS_GPRS(0x1e) |
818 S_028838_GS_GPRS(0x1e) |
819 S_028838_ES_GPRS(0x1e) |
820 S_028838_HS_GPRS(0x1e) |
821 S_028838_LS_GPRS(0x1e));
822 }
823
824 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
825 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
826 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
827
828 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
829
830 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
831 S_0286E8_TID_IN_GROUP_ENA
832 | S_0286E8_TGID_ENA
 833 						| S_0286E8_DISABLE_INDEX_PACK);
835
 836 	/* The LOOP_CONST registers are an optimization for loops that allows
 837 	 * you to store the initial counter, increment value, and maximum
 838 	 * counter value in a register so that the hardware can calculate the
 839 	 * correct number of iterations for the loop and you don't need
 840 	 * to keep the loop counter in your shader code. We don't currently use
 841 	 * this optimization, so we must keep track of the counter in the
 842 	 * shader and use a break instruction to exit loops. However, the
 843 	 * hardware still uses this register to determine when to exit a
 844 	 * loop, so we need to initialize the counter to 0, set the increment
 845 	 * value to 1 and the maximum counter value to 4095 (0xfff), which
 846 	 * is the maximum value allowed. This gives us a maximum of 4096
 847 	 * iterations for our loops, but hopefully our break instruction will
 848 	 * execute some time before the 4096th iteration.
 849 	 */
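	/* 0x1000FFF decodes, assuming the usual SQ_LOOP_CONST field layout of
	 * count[11:0] / init[23:12] / increment[31:24], to a maximum count of
	 * 0xFFF (4095), an initial value of 0 and an increment of 1, matching
	 * the description above. */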
850 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
851 }
852
853 void evergreen_init_compute_state_functions(struct r600_context *ctx)
854 {
855 ctx->b.b.create_compute_state = evergreen_create_compute_state;
856 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
857 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
858 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
859 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
860 ctx->b.b.set_global_binding = evergreen_set_global_binding;
861 ctx->b.b.launch_grid = evergreen_launch_grid;
862
863 }
864
865 struct pipe_resource *r600_compute_global_buffer_create(
866 struct pipe_screen *screen,
867 const struct pipe_resource *templ)
868 {
869 struct r600_resource_global* result = NULL;
870 struct r600_screen* rscreen = NULL;
871 int size_in_dw = 0;
872
873 assert(templ->target == PIPE_BUFFER);
874 assert(templ->bind & PIPE_BIND_GLOBAL);
875 assert(templ->array_size == 1 || templ->array_size == 0);
876 assert(templ->depth0 == 1 || templ->depth0 == 0);
877 assert(templ->height0 == 1 || templ->height0 == 0);
878
879 result = (struct r600_resource_global*)
880 CALLOC(sizeof(struct r600_resource_global), 1);
881 rscreen = (struct r600_screen*)screen;
882
883 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
884 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
885 templ->array_size);
886
887 result->base.b.vtbl = &r600_global_buffer_vtbl;
888 result->base.b.b = *templ;
889 result->base.b.b.screen = screen;
890 pipe_reference_init(&result->base.b.b.reference, 1);
891
892 size_in_dw = (templ->width0+3) / 4;
893
894 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
895
896 if (result->chunk == NULL)
897 {
898 free(result);
899 return NULL;
900 }
901
902 return &result->base.b.b;
903 }
904
905 void r600_compute_global_buffer_destroy(
906 struct pipe_screen *screen,
907 struct pipe_resource *res)
908 {
909 struct r600_resource_global* buffer = NULL;
910 struct r600_screen* rscreen = NULL;
911
912 assert(res->target == PIPE_BUFFER);
913 assert(res->bind & PIPE_BIND_GLOBAL);
914
915 buffer = (struct r600_resource_global*)res;
916 rscreen = (struct r600_screen*)screen;
917
918 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
919
920 buffer->chunk = NULL;
921 free(res);
922 }
923
924 void *r600_compute_global_transfer_map(
925 struct pipe_context *ctx_,
926 struct pipe_resource *resource,
927 unsigned level,
928 unsigned usage,
929 const struct pipe_box *box,
930 struct pipe_transfer **ptransfer)
931 {
932 struct r600_context *rctx = (struct r600_context*)ctx_;
933 struct compute_memory_pool *pool = rctx->screen->global_pool;
934 struct r600_resource_global* buffer =
935 (struct r600_resource_global*)resource;
936
937 struct compute_memory_item *item = buffer->chunk;
938 struct pipe_resource *dst = NULL;
939 unsigned offset = box->x;
940
941 if (is_item_in_pool(item)) {
942 compute_memory_demote_item(pool, item, ctx_);
943 }
944 else {
945 if (item->real_buffer == NULL) {
946 item->real_buffer =
947 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
948 }
949 }
950
951 dst = (struct pipe_resource*)item->real_buffer;
952
953 if (usage & PIPE_TRANSFER_READ)
954 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
955
956 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
957 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
958 "width = %u, height = %u, depth = %u)\n", level, usage,
959 box->x, box->y, box->z, box->width, box->height,
960 box->depth);
961 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
962 "%u (box.x)\n", item->id, box->x);
963
964
965 assert(resource->target == PIPE_BUFFER);
966 assert(resource->bind & PIPE_BIND_GLOBAL);
967 assert(box->x >= 0);
968 assert(box->y == 0);
969 assert(box->z == 0);
970
 971 	/* TODO: do it better, mapping is not possible if the pool is too big */
972 return pipe_buffer_map_range(ctx_, dst,
973 offset, box->width, usage, ptransfer);
974 }
975
976 void r600_compute_global_transfer_unmap(
977 struct pipe_context *ctx_,
978 struct pipe_transfer* transfer)
979 {
980 /* struct r600_resource_global are not real resources, they just map
981 * to an offset within the compute memory pool. The function
982 * r600_compute_global_transfer_map() maps the memory pool
983 * resource rather than the struct r600_resource_global passed to
 984 	 * it as an argument and then initializes ptransfer->resource with
985 * the memory pool resource (via pipe_buffer_map_range).
 986 	 * When transfer_unmap is called it uses the memory pool
 987 	 * resource's vtable, which calls r600_buffer_transfer_unmap() rather
 988 	 * than this function.
989 */
990 assert (!"This function should not be called");
991 }
992
993 void r600_compute_global_transfer_flush_region(
994 struct pipe_context *ctx_,
995 struct pipe_transfer *transfer,
996 const struct pipe_box *box)
997 {
998 assert(0 && "TODO");
999 }
1000
1001 void r600_compute_global_transfer_inline_write(
1002 struct pipe_context *pipe,
1003 struct pipe_resource *resource,
1004 unsigned level,
1005 unsigned usage,
1006 const struct pipe_box *box,
1007 const void *data,
1008 unsigned stride,
1009 unsigned layer_stride)
1010 {
1011 assert(0 && "TODO");
1012 }