src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images: RAT1...
60 for reading images: TEX2...
61 TEX2 and RAT1 are paired
62 
63 TEX2... consumes the same fetch resources that VTX2... would consume
64 
65 CONST0 and VTX0 are for parameters:
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also cached in the constant cache
68 VTX0 is for indirect/non-constant indexing, or when the input is bigger than
69 the constant cache can handle
70 
71 RATs are limited to 12, so we can bind at most 11 textures for writing,
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => at most 10 image bindings for writing.
74 
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78 
79 so 10 for writing is enough. 176 is the max for reading according to the docs.
80 
81 writable images should be listed first (id < 10), so that their id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too because of linear indexing
83
84 */
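/* A minimal, compiled-out sketch of the fixed slot assignments described
 * above (these enum names are illustrative only and are not used anywhere
 * in the driver): */
#if 0
enum evergreen_compute_fixed_slots_sketch {
	EG_COMPUTE_RAT_GLOBAL_WRITE   = 0, /* RAT0: global buffer writes */
	EG_COMPUTE_CONST_PARAMS       = 0, /* CONST0: small, cached kernel params */
	EG_COMPUTE_VTX_PARAMS         = 0, /* VTX0: large/indirectly indexed params */
	EG_COMPUTE_VTX_GLOBAL_READ    = 1, /* VTX1: global buffer reads */
	EG_COMPUTE_RAT_FIRST_IMAGE    = 1, /* writable image id maps to RAT(id + 1) */
	EG_COMPUTE_VTX_FIRST_RESOURCE = 2, /* compute resource i maps to VTX(2 + i) */
};
#endif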
85
86 struct r600_resource* r600_compute_buffer_alloc_vram(
87 struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource * buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create(
94 (struct pipe_screen*) screen,
95 PIPE_BIND_CUSTOM,
96 PIPE_USAGE_IMMUTABLE,
97 size);
98
99 return (struct r600_resource *)buffer;
100 }
101
102
103 static void evergreen_set_rat(
104 struct r600_pipe_compute *pipe,
105 unsigned id,
106 struct r600_resource* bo,
107 int start,
108 int size)
109 {
110 struct pipe_surface rat_templ;
111 struct r600_surface *surf = NULL;
112 struct r600_context *rctx = NULL;
113
114 assert(id < 12);
115 assert((size & 3) == 0);
116 assert((start & 0xFF) == 0);
117
118 rctx = pipe->ctx;
119
120 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
121
122 /* Create the RAT surface */
123 memset(&rat_templ, 0, sizeof(rat_templ));
124 rat_templ.format = PIPE_FORMAT_R32_UINT;
125 rat_templ.u.tex.level = 0;
126 rat_templ.u.tex.first_layer = 0;
127 rat_templ.u.tex.last_layer = 0;
128
129 /* Add the RAT to the list of color buffers */
130 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131 (struct pipe_context *)pipe->ctx,
132 (struct pipe_resource *)bo, &rat_templ);
133
134 /* Update the number of color buffers */
135 pipe->ctx->framebuffer.state.nr_cbufs =
136 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138 /* Update the cb_target_mask
139 * XXX: I think this is a potential spot for bugs once we start doing
140 * GL interop. cb_target_mask may be modified in the 3D sections
141 * of this driver. */
142 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145 evergreen_init_color_surface_rat(rctx, surf);
146 }
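/* Usage note: evergreen_set_global_binding() below binds the compute memory
 * pool at RAT0, and evergreen_set_compute_resources() binds writable surface
 * i at RAT(i + 1), matching the convention documented at the top of this
 * file. */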
147
148 static void evergreen_cs_set_vertex_buffer(
149 struct r600_context * rctx,
150 unsigned vb_index,
151 unsigned offset,
152 struct pipe_resource * buffer)
153 {
154 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156 vb->stride = 1;
157 vb->buffer_offset = offset;
158 vb->buffer = buffer;
159 vb->user_buffer = NULL;
160
161 /* The vertex instructions in the compute shaders use the texture cache,
162 * so we need to invalidate it. */
163 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164 state->enabled_mask |= 1 << vb_index;
165 state->dirty_mask |= 1 << vb_index;
166 r600_mark_atom_dirty(rctx, &state->atom);
167 }
168
169 static void evergreen_cs_set_constant_buffer(
170 struct r600_context * rctx,
171 unsigned cb_index,
172 unsigned offset,
173 unsigned size,
174 struct pipe_resource * buffer)
175 {
176 struct pipe_constant_buffer cb;
177 cb.buffer_size = size;
178 cb.buffer_offset = offset;
179 cb.buffer = buffer;
180 cb.user_buffer = NULL;
181
182 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183 }
184
185 static const struct u_resource_vtbl r600_global_buffer_vtbl =
186 {
187 u_default_resource_get_handle, /* get_handle */
188 r600_compute_global_buffer_destroy, /* resource_destroy */
189 r600_compute_global_transfer_map, /* transfer_map */
190 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191 r600_compute_global_transfer_unmap, /* transfer_unmap */
192 r600_compute_global_transfer_inline_write /* transfer_inline_write */
193 };
194
195
196 void *evergreen_create_compute_state(
197 struct pipe_context *ctx_,
198 const struct pipe_compute_state *cso)
199 {
200 struct r600_context *ctx = (struct r600_context *)ctx_;
201 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const char *code;
205 void *p;
206 boolean use_kill;
207
208 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
211 radeon_shader_binary_init(&shader->binary);
212 radeon_elf_read(code, header->num_bytes, &shader->binary);
213 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
214
215 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
216 shader->bc.ndw * 4);
217 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
218 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
219 ctx->b.ws->buffer_unmap(shader->code_bo->buf);
220 #endif
221
222 shader->ctx = ctx;
223 shader->local_size = cso->req_local_mem;
224 shader->private_size = cso->req_private_mem;
225 shader->input_size = cso->req_input_mem;
226
227 return shader;
228 }
229
230 void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
231 {
232 struct r600_context *ctx = (struct r600_context *)ctx_;
233 COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
234 struct r600_pipe_compute *shader = state;
235
236 if (!shader)
237 return;
238
239 #ifdef HAVE_OPENCL
240 radeon_shader_binary_clean(&shader->binary);
241 r600_destroy_shader(&shader->bc);
242
243 /* TODO destroy shader->code_bo, shader->const_bo
244 * we'll need something like r600_buffer_free */
245 #endif
246 FREE(shader);
247 }
248
249 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
250 {
251 struct r600_context *ctx = (struct r600_context *)ctx_;
252
253 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
254
255 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
256 }
257
258 /* The kernel parameters are uploaded to a buffer bound at ID 0 (CONST0/VTX0,
259 * see the comment at the top of this file). Besides the explicit kernel
260 * parameters, there are implicit parameters that need to be stored in the
261 * same buffer. These parameters are organized in the buffer as follows:
262 *
263 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
264 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
265 * DWORDS 6-8: Number of work items within each work group in each dimension
266 * (x,y,z)
267 * DWORDS 9+ : Kernel parameters
268 */
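/* A compiled-out sketch of that layout as a struct, for illustration only
 * (evergreen_compute_upload_input() below writes the dwords directly and
 * does not use this type): */
#if 0
struct evergreen_compute_input_sketch {
	uint32_t num_work_groups[3];  /* DWORDS 0-2 */
	uint32_t global_size[3];      /* DWORDS 3-5: grid * block per dimension */
	uint32_t local_size[3];       /* DWORDS 6-8: block size per dimension */
	uint32_t kernel_parameters[]; /* DWORDS 9+ */
};
#endif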
269 void evergreen_compute_upload_input(
270 struct pipe_context *ctx_,
271 const uint *block_layout,
272 const uint *grid_layout,
273 const void *input)
274 {
275 struct r600_context *ctx = (struct r600_context *)ctx_;
276 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
277 unsigned i;
278 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
279 * parameters.
280 */
281 unsigned input_size = shader->input_size + 36;
282 uint32_t * num_work_groups_start;
283 uint32_t * global_size_start;
284 uint32_t * local_size_start;
285 uint32_t * kernel_parameters_start;
286 struct pipe_box box;
287 struct pipe_transfer *transfer = NULL;
288
289 if (shader->input_size == 0) {
290 return;
291 }
292
293 if (!shader->kernel_param) {
294 /* Add space for the grid dimensions */
295 shader->kernel_param = (struct r600_resource *)
296 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
297 PIPE_USAGE_IMMUTABLE, input_size);
298 }
299
300 u_box_1d(0, input_size, &box);
301 num_work_groups_start = ctx_->transfer_map(ctx_,
302 (struct pipe_resource*)shader->kernel_param,
303 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
304 &box, &transfer);
305 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
306 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
307 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
308
309 /* Copy the work group size */
310 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
311
312 /* Copy the global size */
313 for (i = 0; i < 3; i++) {
314 global_size_start[i] = grid_layout[i] * block_layout[i];
315 }
316
317 /* Copy the local dimensions */
318 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
319
320 /* Copy the kernel inputs */
321 memcpy(kernel_parameters_start, input, shader->input_size);
322
323 for (i = 0; i < (input_size / 4); i++) {
324 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
325 ((unsigned*)num_work_groups_start)[i]);
326 }
327
328 ctx_->transfer_unmap(ctx_, transfer);
329
330 /* ID=0 is reserved for the parameters */
331 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
332 (struct pipe_resource*)shader->kernel_param);
333 }
334
335 static void evergreen_emit_direct_dispatch(
336 struct r600_context *rctx,
337 const uint *block_layout, const uint *grid_layout)
338 {
339 int i;
340 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
341 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
342 unsigned num_waves;
343 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
344 unsigned wave_divisor = (16 * num_pipes);
345 int group_size = 1;
346 int grid_size = 1;
347 unsigned lds_size = shader->local_size / 4 +
348 shader->bc.nlds_dw;
349
350
351 /* Calculate group_size/grid_size */
352 for (i = 0; i < 3; i++) {
353 group_size *= block_layout[i];
354 }
355
356 for (i = 0; i < 3; i++) {
357 grid_size *= grid_layout[i];
358 }
359
360 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
361 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
362 wave_divisor - 1) / wave_divisor;
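	/* Example with illustrative numbers: a 16x16x1 thread block on a part
	 * with 8 quad pipes needs ceil(256 / (16 * 8)) = 2 wavefronts. */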
363
364 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
365 "%u wavefronts per thread block, "
366 "allocating %u dwords lds.\n",
367 num_pipes, num_waves, lds_size);
368
369 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
370
371 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
372 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
373 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
374 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
375
376 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
377 group_size);
378
379 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
380 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
381 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
382 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
383
384 if (rctx->b.chip_class < CAYMAN) {
385 assert(lds_size <= 8192);
386 } else {
387 /* Cayman appears to have a slightly smaller limit, see the
388 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
389 assert(lds_size <= 8160);
390 }
391
392 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
393 lds_size | (num_waves << 14));
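	/* Illustrative note: judging from the expression above, the LDS size
	 * occupies the low bits of SQ_LDS_ALLOC and the wave count is packed
	 * into a field starting at bit 14. */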
394
395 /* Dispatch packet */
396 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
397 radeon_emit(cs, grid_layout[0]);
398 radeon_emit(cs, grid_layout[1]);
399 radeon_emit(cs, grid_layout[2]);
400 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
401 radeon_emit(cs, 1);
402 }
403
404 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
405 const uint *grid_layout)
406 {
407 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
408 unsigned i;
409
410 /* make sure that the gfx ring is the only one active */
411 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
412 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
413 }
414
415 /* Initialize all the compute-related registers.
416 *
417 * See evergreen_init_atom_start_compute_cs() in this file for the list
418 * of registers initialized by the start_compute_cs_cmd atom.
419 */
420 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
421
422 /* emit config state */
423 if (ctx->b.chip_class == EVERGREEN)
424 r600_emit_atom(ctx, &ctx->config_state.atom);
425
426 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
427 r600_flush_emit(ctx);
428
429 /* Emit colorbuffers. */
430 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
431 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
432 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
433 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
434 (struct r600_resource*)cb->base.texture,
435 RADEON_USAGE_READWRITE,
436 RADEON_PRIO_SHADER_RW_BUFFER);
437
438 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
439 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
440 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
441 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
442 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
443 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
444 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
445 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
446
447 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
448 radeon_emit(cs, reloc);
449
450 if (!ctx->keep_tiling_flags) {
451 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
452 radeon_emit(cs, reloc);
453 }
454
455 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
456 radeon_emit(cs, reloc);
457 }
458 if (ctx->keep_tiling_flags) {
459 for (; i < 8 ; i++) {
460 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
461 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
462 }
463 for (; i < 12; i++) {
464 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
465 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
466 }
467 }
468
469 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
470 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
471 ctx->compute_cb_target_mask);
472
473
474 /* Emit vertex buffer state */
475 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
476 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
477
478 /* Emit constant buffer state */
479 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
480
481 /* Emit sampler state */
482 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
483
484 /* Emit sampler view (texture resource) state */
485 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
486
487 /* Emit compute shader state */
488 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
489
490 /* Emit dispatch state and dispatch packet */
491 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
492
493 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
494 */
495 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
496 R600_CONTEXT_INV_VERTEX_CACHE |
497 R600_CONTEXT_INV_TEX_CACHE;
498 r600_flush_emit(ctx);
499 ctx->b.flags = 0;
500
501 if (ctx->b.chip_class >= CAYMAN) {
502 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
503 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
504 /* DEALLOC_STATE prevents the GPU from hanging when a
505 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
506 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
507 */
508 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
509 cs->buf[cs->cdw++] = 0;
510 }
511
512 #if 0
513 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
514 for (i = 0; i < cs->cdw; i++) {
515 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
516 }
517 #endif
518
519 }
520
521
522 /**
523 * Emit function for r600_cs_shader_state atom
524 */
525 void evergreen_emit_cs_shader(
526 struct r600_context *rctx,
527 struct r600_atom *atom)
528 {
529 struct r600_cs_shader_state *state =
530 (struct r600_cs_shader_state*)atom;
531 struct r600_pipe_compute *shader = state->shader;
532 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
533 uint64_t va;
534 struct r600_resource *code_bo;
535 unsigned ngpr, nstack;
536
537 code_bo = shader->code_bo;
538 va = shader->code_bo->gpu_address + state->pc;
539 ngpr = shader->bc.ngpr;
540 nstack = shader->bc.nstack;
541
542 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
543 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
544 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
545 S_0288D4_NUM_GPRS(ngpr)
546 | S_0288D4_STACK_SIZE(nstack));
547 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
548
549 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
550 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
551 code_bo, RADEON_USAGE_READ,
552 RADEON_PRIO_USER_SHADER));
553 }
554
555 static void evergreen_launch_grid(
556 struct pipe_context *ctx_,
557 const uint *block_layout, const uint *grid_layout,
558 uint32_t pc, const void *input)
559 {
560 struct r600_context *ctx = (struct r600_context *)ctx_;
561 #ifdef HAVE_OPENCL
562 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
563 boolean use_kill;
564
565 ctx->cs_shader_state.pc = pc;
566 /* Get the config information for this kernel. */
567 r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
568 #endif
569
570 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
571
572
573 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
574 compute_emit_cs(ctx, block_layout, grid_layout);
575 }
576
577 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
578 unsigned start, unsigned count,
579 struct pipe_surface ** surfaces)
580 {
581 struct r600_context *ctx = (struct r600_context *)ctx_;
582 struct r600_surface **resources = (struct r600_surface **)surfaces;
583
584 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
585 start, count);
586
587 for (unsigned i = 0; i < count; i++) {
588 /* The first two vertex buffers are reserved for parameters and
589 * global buffers. */
590 unsigned vtx_id = 2 + i;
591 if (resources[i]) {
592 struct r600_resource_global *buffer =
593 (struct r600_resource_global*)
594 resources[i]->base.texture;
595 if (resources[i]->base.writable) {
596 assert(i+1 < 12);
597
598 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
599 (struct r600_resource *)resources[i]->base.texture,
600 buffer->chunk->start_in_dw*4,
601 resources[i]->base.texture->width0);
602 }
603
604 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
605 buffer->chunk->start_in_dw * 4,
606 resources[i]->base.texture);
607 }
608 }
609 }
610
611 static void evergreen_set_global_binding(
612 struct pipe_context *ctx_, unsigned first, unsigned n,
613 struct pipe_resource **resources,
614 uint32_t **handles)
615 {
616 struct r600_context *ctx = (struct r600_context *)ctx_;
617 struct compute_memory_pool *pool = ctx->screen->global_pool;
618 struct r600_resource_global **buffers =
619 (struct r600_resource_global **)resources;
620 unsigned i;
621
622 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
623 first, n);
624
625 if (!resources) {
626 /* XXX: Unset */
627 return;
628 }
629
630 /* We mark these items for promotion to the pool if they
631 * aren't already there */
632 for (i = first; i < first + n; i++) {
633 struct compute_memory_item *item = buffers[i]->chunk;
634
635 if (!is_item_in_pool(item))
636 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
637 }
638
639 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
640 /* XXX: Unset */
641 return;
642 }
643
644 for (i = first; i < first + n; i++)
645 {
646 uint32_t buffer_offset;
647 uint32_t handle;
648 assert(resources[i]->target == PIPE_BUFFER);
649 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
650
651 buffer_offset = util_le32_to_cpu(*(handles[i]));
652 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
653
654 *(handles[i]) = util_cpu_to_le32(handle);
655 }
656
657 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
658 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
659 (struct pipe_resource*)pool->bo);
660 }
661
662 /**
663 * This function initializes all the compute specific registers that need to
664 * be initialized for each compute command stream. Registers that are common
665 * to both compute and 3D will be initialized at the beginning of each compute
666 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
667 * packet requires that the shader type bit be set, we must initialize all
668 * context registers needed for compute in this function. The registers
669 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
670 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
671 * on the GPU family.
672 */
673 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
674 {
675 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
676 int num_threads;
677 int num_stack_entries;
678
679 /* since all required registers are initialized in the
680 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
681 */
682 r600_init_command_buffer(cb, 256);
683 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
684
685 /* This must be first. */
686 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
687 r600_store_value(cb, 0x80000000);
688 r600_store_value(cb, 0x80000000);
689
690 /* We're setting config registers here. */
691 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
692 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
693
694 switch (ctx->b.family) {
695 case CHIP_CEDAR:
696 default:
697 num_threads = 128;
698 num_stack_entries = 256;
699 break;
700 case CHIP_REDWOOD:
701 num_threads = 128;
702 num_stack_entries = 256;
703 break;
704 case CHIP_JUNIPER:
705 num_threads = 128;
706 num_stack_entries = 512;
707 break;
708 case CHIP_CYPRESS:
709 case CHIP_HEMLOCK:
710 num_threads = 128;
711 num_stack_entries = 512;
712 break;
713 case CHIP_PALM:
714 num_threads = 128;
715 num_stack_entries = 256;
716 break;
717 case CHIP_SUMO:
718 num_threads = 128;
719 num_stack_entries = 256;
720 break;
721 case CHIP_SUMO2:
722 num_threads = 128;
723 num_stack_entries = 512;
724 break;
725 case CHIP_BARTS:
726 num_threads = 128;
727 num_stack_entries = 512;
728 break;
729 case CHIP_TURKS:
730 num_threads = 128;
731 num_stack_entries = 256;
732 break;
733 case CHIP_CAICOS:
734 num_threads = 128;
735 num_stack_entries = 256;
736 break;
737 }
738
739 /* Config Registers */
740 if (ctx->b.chip_class < CAYMAN)
741 evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
742 ctx->screen->b.info.drm_minor);
743 else
744 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
745 ctx->screen->b.info.drm_minor);
746
747 /* The primitive type always needs to be POINTLIST for compute. */
748 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
749 V_008958_DI_PT_POINTLIST);
750
751 if (ctx->b.chip_class < CAYMAN) {
752
753 /* These registers control which simds can be used by each stage.
754 * The default for these registers is 0xffffffff, which means
755 * all simds are available for each stage. It's possible we may
756 * want to play around with these in the future, but for now
757 * the default value is fine.
758 *
759 * R_008E20_SQ_STATIC_THREAD_MGMT1
760 * R_008E24_SQ_STATIC_THREAD_MGMT2
761 * R_008E28_SQ_STATIC_THREAD_MGMT3
762 */
763
764 /* XXX: We may need to adjust the thread and stack resource
765 * values for 3D/compute interop */
766
767 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
768
769 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
770 * Set the number of threads used by the PS/VS/GS/ES stage to
771 * 0.
772 */
773 r600_store_value(cb, 0);
774
775 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
776 * Set the number of threads used by the CS (aka LS) stage to
777 * the maximum number of threads and set the number of threads
778 * for the HS stage to 0. */
779 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
780
781 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
782 * Set the Control Flow stack entries to 0 for PS/VS stages */
783 r600_store_value(cb, 0);
784
785 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
786 * Set the Control Flow stack entries to 0 for GS/ES stages */
787 r600_store_value(cb, 0);
788
789 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
790 * Set the Control Flow stack entries to 0 for the HS stage, and
791 * set it to the maximum value for the CS (aka LS) stage. */
792 r600_store_value(cb,
793 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
794 }
795 /* Give the compute shader all the available LDS space.
796 * NOTE: This only sets the maximum number of dwords that a compute
797 * shader can allocate. When a shader is executed, we still need to
798 * allocate the appropriate amount of LDS dwords using the
799 * CM_R_0288E8_SQ_LDS_ALLOC register.
800 */
801 if (ctx->b.chip_class < CAYMAN) {
802 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
803 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
804 } else {
805 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
806 S_0286FC_NUM_PS_LDS(0) |
807 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
808 }
809
810 /* Context Registers */
811
812 if (ctx->b.chip_class < CAYMAN) {
813 /* workaround for hw issues with dyn gpr - must set all limits
814 * to 240 instead of 0, 0x1e == 240 / 8
815 */
816 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
817 S_028838_PS_GPRS(0x1e) |
818 S_028838_VS_GPRS(0x1e) |
819 S_028838_GS_GPRS(0x1e) |
820 S_028838_ES_GPRS(0x1e) |
821 S_028838_HS_GPRS(0x1e) |
822 S_028838_LS_GPRS(0x1e));
823 }
824
825 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
826 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
827 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
828
829 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
830
831 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
832 S_0286E8_TID_IN_GROUP_ENA |
833 S_0286E8_TGID_ENA |
834 S_0286E8_DISABLE_INDEX_PACK);
835 
836
837 /* The LOOP_CONST registers are an optimization for loops that allows
838 * you to store the initial counter, increment value, and maximum
839 * counter value in a register so that hardware can calculate the
840 * correct number of iterations for the loop, so that you don't need
841 * to have the loop counter in your shader code. We don't currently use
842 * this optimization, so we must keep track of the counter in the
843 * shader and use a break instruction to exit loops. However, the
844 * hardware will still use this register to determine when to exit a
845 * loop, so we need to initialize the counter to 0, set the increment
846 * value to 1, and set the maximum counter value to 4095 (0xfff), which
847 * is the maximum value allowed. This gives us a maximum of 4096
848 * iterations for our loops, but hopefully our break instruction will
849 * execute some time before the 4096th iteration.
850 */
851 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
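	/* Assuming the usual SQ_LOOP_CONST packing (count in the low 12 bits,
	 * initial value in the middle field, increment in the top byte),
	 * 0x1000FFF encodes increment = 1, init = 0, max count = 0xfff,
	 * matching the comment above. */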
852 }
853
854 void evergreen_init_compute_state_functions(struct r600_context *ctx)
855 {
856 ctx->b.b.create_compute_state = evergreen_create_compute_state;
857 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
858 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
859 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
860 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
861 ctx->b.b.set_global_binding = evergreen_set_global_binding;
862 ctx->b.b.launch_grid = evergreen_launch_grid;
863
864 }
865
866 struct pipe_resource *r600_compute_global_buffer_create(
867 struct pipe_screen *screen,
868 const struct pipe_resource *templ)
869 {
870 struct r600_resource_global* result = NULL;
871 struct r600_screen* rscreen = NULL;
872 int size_in_dw = 0;
873
874 assert(templ->target == PIPE_BUFFER);
875 assert(templ->bind & PIPE_BIND_GLOBAL);
876 assert(templ->array_size == 1 || templ->array_size == 0);
877 assert(templ->depth0 == 1 || templ->depth0 == 0);
878 assert(templ->height0 == 1 || templ->height0 == 0);
879
880 result = (struct r600_resource_global*)
881 CALLOC(sizeof(struct r600_resource_global), 1);
882 rscreen = (struct r600_screen*)screen;
883
884 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
885 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
886 templ->array_size);
887
888 result->base.b.vtbl = &r600_global_buffer_vtbl;
889 result->base.b.b = *templ;
890 result->base.b.b.screen = screen;
891 pipe_reference_init(&result->base.b.b.reference, 1);
892
893 size_in_dw = (templ->width0+3) / 4;
894
895 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
896
897 if (result->chunk == NULL)
898 {
899 free(result);
900 return NULL;
901 }
902
903 return &result->base.b.b;
904 }
905
906 void r600_compute_global_buffer_destroy(
907 struct pipe_screen *screen,
908 struct pipe_resource *res)
909 {
910 struct r600_resource_global* buffer = NULL;
911 struct r600_screen* rscreen = NULL;
912
913 assert(res->target == PIPE_BUFFER);
914 assert(res->bind & PIPE_BIND_GLOBAL);
915
916 buffer = (struct r600_resource_global*)res;
917 rscreen = (struct r600_screen*)screen;
918
919 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
920
921 buffer->chunk = NULL;
922 free(res);
923 }
924
925 void *r600_compute_global_transfer_map(
926 struct pipe_context *ctx_,
927 struct pipe_resource *resource,
928 unsigned level,
929 unsigned usage,
930 const struct pipe_box *box,
931 struct pipe_transfer **ptransfer)
932 {
933 struct r600_context *rctx = (struct r600_context*)ctx_;
934 struct compute_memory_pool *pool = rctx->screen->global_pool;
935 struct r600_resource_global* buffer =
936 (struct r600_resource_global*)resource;
937
938 struct compute_memory_item *item = buffer->chunk;
939 struct pipe_resource *dst = NULL;
940 unsigned offset = box->x;
941
942 if (is_item_in_pool(item)) {
943 compute_memory_demote_item(pool, item, ctx_);
944 }
945 else {
946 if (item->real_buffer == NULL) {
947 item->real_buffer =
948 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
949 }
950 }
951
952 dst = (struct pipe_resource*)item->real_buffer;
953
954 if (usage & PIPE_TRANSFER_READ)
955 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
956
957 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
958 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
959 "width = %u, height = %u, depth = %u)\n", level, usage,
960 box->x, box->y, box->z, box->width, box->height,
961 box->depth);
962 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
963 "%u (box.x)\n", item->id, box->x);
964
965
966 assert(resource->target == PIPE_BUFFER);
967 assert(resource->bind & PIPE_BIND_GLOBAL);
968 assert(box->x >= 0);
969 assert(box->y == 0);
970 assert(box->z == 0);
971
972 /* TODO: do this better; mapping is not possible if the pool is too big */
973 return pipe_buffer_map_range(ctx_, dst,
974 offset, box->width, usage, ptransfer);
975 }
976
977 void r600_compute_global_transfer_unmap(
978 struct pipe_context *ctx_,
979 struct pipe_transfer* transfer)
980 {
981 /* struct r600_resource_global are not real resources, they just map
982 * to an offset within the compute memory pool. The function
983 * r600_compute_global_transfer_map() maps the memory pool
984 * resource rather than the struct r600_resource_global passed to
985 * it as an argument and then initializes ptransfer->resource with
986 * the memory pool resource (via pipe_buffer_map_range).
987 * When transfer_unmap is called it uses the memory pool's
988 * vtable, which calls r600_buffer_transfer_unmap() rather than
989 * this function.
990 */
991 assert (!"This function should not be called");
992 }
993
994 void r600_compute_global_transfer_flush_region(
995 struct pipe_context *ctx_,
996 struct pipe_transfer *transfer,
997 const struct pipe_box *box)
998 {
999 assert(0 && "TODO");
1000 }
1001
1002 void r600_compute_global_transfer_inline_write(
1003 struct pipe_context *pipe,
1004 struct pipe_resource *resource,
1005 unsigned level,
1006 unsigned usage,
1007 const struct pipe_box *box,
1008 const void *data,
1009 unsigned stride,
1010 unsigned layer_stride)
1011 {
1012 assert(0 && "TODO");
1013 }