r600g/compute: Move compute_shader_create() function into evergreen_compute.c
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/u_double_list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_resource.h"
43 #include "r600_shader.h"
44 #include "r600_pipe.h"
45 #include "r600_formats.h"
46 #include "evergreen_compute.h"
47 #include "evergreen_compute_internal.h"
48 #include "compute_memory_pool.h"
49 #include "sb/sb_public.h"
50 #ifdef HAVE_OPENCL
51 #include "radeon_llvm_util.h"
52 #endif
53
54 /**
55 RAT0 is for global binding write
56 VTX1 is for global binding read
57
58 for writing images RAT1...
59 for reading images TEX2...
60 TEX2-RAT1 is paired
61
62 TEX2... consumes the same fetch resources that VTX2... would consume
63
64 CONST0 and VTX0 are for parameters
65 CONST0 binds the smaller input parameter buffer and is used for constant indexing;
66 it is also constant cached
67 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
68 the constant cache can handle
69
70 RATs are limited to 12, so we can bind at most 11 textures for writing
71 because we reserve RAT0 for global bindings. With byte addressing enabled,
72 we should reserve another one too => at most 10 image bindings for writing.
73
74 from Nvidia OpenCL:
75 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
76 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
77
78 so 10 for writing is enough. 176 is the max for reading according to the docs.
79
80 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
81 writable images will consume TEX slots, and VTX slots too, because of linear indexing
82
83 */
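/* A hedged summary of the binding convention above (slot numbers are taken
 * only from this comment, not from other driver code):
 *
 *   kernel parameters        CONST0 / VTX0
 *   global buffer pool       RAT0 (write) / VTX1 (read)
 *   writable image id i      RAT(i+1) (write) / TEX(i+2) (read)
 *   read-only image id i     TEX(i+2)
 */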
84
85 struct r600_resource* r600_compute_buffer_alloc_vram(
86 struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource * buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create(
93 (struct pipe_screen*) screen,
94 PIPE_BIND_CUSTOM,
95 PIPE_USAGE_IMMUTABLE,
96 size);
97
98 return (struct r600_resource *)buffer;
99 }
100
101
102 static void evergreen_set_rat(
103 struct r600_pipe_compute *pipe,
104 int id,
105 struct r600_resource* bo,
106 int start,
107 int size)
108 {
109 struct pipe_surface rat_templ;
110 struct r600_surface *surf = NULL;
111 struct r600_context *rctx = NULL;
112
113 assert(id < 12);
114 assert((size & 3) == 0);
115 assert((start & 0xFF) == 0);
116
117 rctx = pipe->ctx;
118
119 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
120
121 /* Create the RAT surface */
122 memset(&rat_templ, 0, sizeof(rat_templ));
123 rat_templ.format = PIPE_FORMAT_R32_UINT;
124 rat_templ.u.tex.level = 0;
125 rat_templ.u.tex.first_layer = 0;
126 rat_templ.u.tex.last_layer = 0;
127
128 /* Add the RAT to the list of color buffers */
129 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->context.create_surface(
130 (struct pipe_context *)pipe->ctx,
131 (struct pipe_resource *)bo, &rat_templ);
132
133 /* Update the number of color buffers */
134 pipe->ctx->framebuffer.state.nr_cbufs =
135 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
136
137 /* Update the cb_target_mask
138 * XXX: I think this is a potential spot for bugs once we start doing
139 * GL interop. cb_target_mask may be modified in the 3D sections
140 * of this driver. */
141 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
142
143 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
144 evergreen_init_color_surface_rat(rctx, surf);
145 }
146
147 static void evergreen_cs_set_vertex_buffer(
148 struct r600_context * rctx,
149 unsigned vb_index,
150 unsigned offset,
151 struct pipe_resource * buffer)
152 {
153 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
154 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
155 vb->stride = 1;
156 vb->buffer_offset = offset;
157 vb->buffer = buffer;
158 vb->user_buffer = NULL;
159
160 /* The vertex instructions in the compute shaders use the texture cache,
161 * so we need to invalidate it. */
162 rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
163 state->enabled_mask |= 1 << vb_index;
164 state->dirty_mask |= 1 << vb_index;
165 state->atom.dirty = true;
166 }
167
168 static void evergreen_cs_set_constant_buffer(
169 struct r600_context * rctx,
170 unsigned cb_index,
171 unsigned offset,
172 unsigned size,
173 struct pipe_resource * buffer)
174 {
175 struct pipe_constant_buffer cb;
176 cb.buffer_size = size;
177 cb.buffer_offset = offset;
178 cb.buffer = buffer;
179 cb.user_buffer = NULL;
180
181 rctx->context.set_constant_buffer(&rctx->context, PIPE_SHADER_COMPUTE, cb_index, &cb);
182 }
183
184 static const struct u_resource_vtbl r600_global_buffer_vtbl =
185 {
186 u_default_resource_get_handle, /* get_handle */
187 r600_compute_global_buffer_destroy, /* resource_destroy */
188 r600_compute_global_transfer_map, /* transfer_map */
189 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
190 r600_compute_global_transfer_unmap, /* transfer_unmap */
191 r600_compute_global_transfer_inline_write /* transfer_inline_write */
192 };
193
194
195 void *evergreen_create_compute_state(
196 struct pipe_context *ctx_,
197 const struct pipe_compute_state *cso)
198 {
199 struct r600_context *ctx = (struct r600_context *)ctx_;
200 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
201
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const unsigned char * code;
205 unsigned i;
206
207 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
208
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
211 #endif
212
213 shader->ctx = (struct r600_context*)ctx;
214 shader->local_size = cso->req_local_mem; ///TODO: assert it
215 shader->private_size = cso->req_private_mem;
216 shader->input_size = cso->req_input_mem;
217
218 #ifdef HAVE_OPENCL
219 shader->num_kernels = radeon_llvm_get_num_kernels(code, header->num_bytes);
220 shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
221
222 for (i = 0; i < shader->num_kernels; i++) {
223 struct r600_kernel *kernel = &shader->kernels[i];
224 kernel->llvm_module = radeon_llvm_get_kernel_module(i, code,
225 header->num_bytes);
226 }
227 #endif
228 return shader;
229 }
230
231 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
232 {
233 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
234
235 free(shader);
236 }
237
238 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
239 {
240 struct r600_context *ctx = (struct r600_context *)ctx_;
241
242 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
243
244 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
245 }
246
247 /* The kernel parameters are stored in a vtx buffer (ID=0); besides the explicit
248 * kernel parameters there are implicit parameters that need to be stored
249 * in the vertex buffer as well. Here is how these parameters are organized in
250 * the buffer:
251 *
252 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
253 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
254 * DWORDS 6-8: Number of work items within each work group in each dimension
255 * (x,y,z)
256 * DWORDS 9+ : Kernel parameters
257 */
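/* A hypothetical example of the resulting layout (values chosen purely for
 * illustration): with block_layout = {64, 1, 1}, grid_layout = {4, 1, 1} and
 * a single 8-byte kernel argument, the buffer would hold:
 *
 *   DWORDS 0-2 : 4, 1, 1     (work groups per dimension)
 *   DWORDS 3-5 : 256, 1, 1   (global work items = grid * block)
 *   DWORDS 6-8 : 64, 1, 1    (work items per group)
 *   DWORDS 9-10: the 8 bytes of kernel arguments
 */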
258 void evergreen_compute_upload_input(
259 struct pipe_context *ctx_,
260 const uint *block_layout,
261 const uint *grid_layout,
262 const void *input)
263 {
264 struct r600_context *ctx = (struct r600_context *)ctx_;
265 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
266 int i;
267 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
268 * parameters.
269 */
270 unsigned input_size = shader->input_size + 36;
271 uint32_t * num_work_groups_start;
272 uint32_t * global_size_start;
273 uint32_t * local_size_start;
274 uint32_t * kernel_parameters_start;
275 struct pipe_box box;
276 struct pipe_transfer *transfer = NULL;
277
278 if (shader->input_size == 0) {
279 return;
280 }
281
282 if (!shader->kernel_param) {
283 /* Add space for the grid dimensions */
284 shader->kernel_param = (struct r600_resource *)
285 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
286 PIPE_USAGE_IMMUTABLE, input_size);
287 }
288
289 u_box_1d(0, input_size, &box);
290 num_work_groups_start = ctx_->transfer_map(ctx_,
291 (struct pipe_resource*)shader->kernel_param,
292 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
293 &box, &transfer);
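/* Each of the three implicit blocks is 3 uints (12 bytes); since
 * sizeof(uint) is 4, (3 * (sizeof(uint) / 4)) advances these uint32_t
 * pointers by 3 elements. */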
294 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
295 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
296 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
297
298 /* Copy the work group size */
299 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
300
301 /* Copy the global size */
302 for (i = 0; i < 3; i++) {
303 global_size_start[i] = grid_layout[i] * block_layout[i];
304 }
305
306 /* Copy the local dimensions */
307 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
308
309 /* Copy the kernel inputs */
310 memcpy(kernel_parameters_start, input, shader->input_size);
311
312 for (i = 0; i < (input_size / 4); i++) {
313 COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
314 ((unsigned*)num_work_groups_start)[i]);
315 }
316
317 ctx_->transfer_unmap(ctx_, transfer);
318
319 /* ID=0 is reserved for the parameters */
320 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
321 (struct pipe_resource*)shader->kernel_param);
322 }
323
324 static void evergreen_emit_direct_dispatch(
325 struct r600_context *rctx,
326 const uint *block_layout, const uint *grid_layout)
327 {
328 int i;
329 struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
330 unsigned num_waves;
331 unsigned num_pipes = rctx->screen->info.r600_max_pipes;
332 unsigned wave_divisor = (16 * num_pipes);
333 int group_size = 1;
334 int grid_size = 1;
335 /* XXX: Enable lds and get size from cs_shader_state */
336 unsigned lds_size = 0;
337
338 /* Calculate group_size/grid_size */
339 for (i = 0; i < 3; i++) {
340 group_size *= block_layout[i];
341 }
342
343 for (i = 0; i < 3; i++) {
344 grid_size *= grid_layout[i];
345 }
346
347 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
348 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
349 wave_divisor - 1) / wave_divisor;
350
351 COMPUTE_DBG(rctx->screen, "Using %u pipes, there are %u wavefronts per thread block\n",
352 num_pipes, num_waves);
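/* Worked example of the wave count (illustrative numbers, not read from any
 * particular ASIC): block_layout = {16, 16, 1} gives 256 threads; with
 * r600_max_pipes = 8, wave_divisor = 128 and num_waves = ceil(256 / 128) = 2.
 */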
353
354 /* XXX: Partition the LDS between PS/CS. By default half (4096 dwords
355 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
356 * We may need to allocate the entire LDS space for Compute Shaders.
357 *
358 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
359 * CM: CM_R_0286FC_SPI_LDS_MGMT := S_0286FC_NUM_LS_LDS(lds_dwords)
360 */
361
362 r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
363
364 r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
365 r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
366 r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
367 r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
368
369 r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
370 group_size);
371
372 r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
373 r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
374 r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
375 r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
376
377 r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
378 lds_size | (num_waves << 14));
379
380 /* Dispatch packet */
381 r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
382 r600_write_value(cs, grid_layout[0]);
383 r600_write_value(cs, grid_layout[1]);
384 r600_write_value(cs, grid_layout[2]);
385 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
386 r600_write_value(cs, 1);
387 }
388
389 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
390 const uint *grid_layout)
391 {
392 struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
393 unsigned flush_flags = 0;
394 int i;
395
396 /* make sure that the gfx ring is the only one active */
397 if (ctx->rings.dma.cs) {
398 ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
399 }
400
401 /* Initialize all the compute-related registers.
402 *
403 * See evergreen_init_atom_start_compute_cs() in this file for the list
404 * of registers initialized by the start_compute_cs_cmd atom.
405 */
406 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
407
408 ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
409 r600_flush_emit(ctx);
410
411 /* Emit colorbuffers. */
412 for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
413 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
414 unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
415 (struct r600_resource*)cb->base.texture,
416 RADEON_USAGE_READWRITE);
417
418 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
419 r600_write_value(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
420 r600_write_value(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
421 r600_write_value(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
422 r600_write_value(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
423 r600_write_value(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
424 r600_write_value(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
425 r600_write_value(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
426
427 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
428 r600_write_value(cs, reloc);
429
430 if (!ctx->keep_tiling_flags) {
431 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
432 r600_write_value(cs, reloc);
433 }
434
435 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
436 r600_write_value(cs, reloc);
437 }
438
439 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
440 r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
441 ctx->compute_cb_target_mask);
442
443
444 /* Emit vertex buffer state */
445 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
446 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
447
448 /* Emit constant buffer state */
449 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
450
451 /* Emit compute shader state */
452 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
453
454 /* Emit dispatch state and dispatch packet */
455 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
456
457 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
458 */
459 ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
460 r600_flush_emit(ctx);
461
462 #if 0
463 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
464 for (i = 0; i < cs->cdw; i++) {
465 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
466 }
467 #endif
468
469 flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
470 if (ctx->keep_tiling_flags) {
471 flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
472 }
473
474 ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags, ctx->screen->cs_count++);
475
476 ctx->flags = 0;
477
478 COMPUTE_DBG(ctx->screen, "shader started\n");
479 }
480
481
482 /**
483 * Emit function for r600_cs_shader_state atom
484 */
485 void evergreen_emit_cs_shader(
486 struct r600_context *rctx,
487 struct r600_atom *atom)
488 {
489 struct r600_cs_shader_state *state =
490 (struct r600_cs_shader_state*)atom;
491 struct r600_pipe_compute *shader = state->shader;
492 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
493 struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
494 uint64_t va;
495
496 va = r600_resource_va(&rctx->screen->screen, &kernel->code_bo->b.b);
497
498 r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
499 r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
500 r600_write_value(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
501 S_0288D4_NUM_GPRS(kernel->bc.ngpr)
502 | S_0288D4_STACK_SIZE(kernel->bc.nstack));
503 r600_write_value(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
504
505 r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
506 r600_write_value(cs, r600_context_bo_reloc(rctx, &rctx->rings.gfx,
507 kernel->code_bo, RADEON_USAGE_READ));
508
509 rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
510 }
511
512 static void evergreen_launch_grid(
513 struct pipe_context *ctx_,
514 const uint *block_layout, const uint *grid_layout,
515 uint32_t pc, const void *input)
516 {
517 struct r600_context *ctx = (struct r600_context *)ctx_;
518
519 #ifdef HAVE_OPENCL
520 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
521
522 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
523 if (!shader->kernels[pc].code_bo) {
524 void *p;
525 struct r600_kernel *kernel = &shader->kernels[pc];
526 struct r600_bytecode *bc = &kernel->bc;
527 LLVMModuleRef mod = kernel->llvm_module;
528 boolean use_kill = false;
529 bool dump = (ctx->screen->debug_flags & DBG_CS) != 0;
530 unsigned use_sb = ctx->screen->debug_flags & DBG_SB_CS;
531 unsigned sb_disasm = use_sb ||
532 (ctx->screen->debug_flags & DBG_SB_DISASM);
533
534 r600_bytecode_init(bc, ctx->chip_class, ctx->family,
535 ctx->screen->has_compressed_msaa_texturing);
536 bc->type = TGSI_PROCESSOR_COMPUTE;
537 bc->isa = ctx->isa;
538 r600_llvm_compile(mod, ctx->family, bc, &use_kill, dump);
539
540 if (dump && !sb_disasm) {
541 r600_bytecode_disasm(bc);
542 } else if ((dump && sb_disasm) || use_sb) {
543 if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
544 R600_ERR("r600_sb_bytecode_process failed!\n");
545 }
546
547 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
548 kernel->bc.ndw * 4);
549 p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
550 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
551 ctx->ws->buffer_unmap(kernel->code_bo->cs_buf);
552 }
553 #endif
554
555 ctx->cs_shader_state.kernel_index = pc;
556 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
557 compute_emit_cs(ctx, block_layout, grid_layout);
558 }
559
560 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
561 unsigned start, unsigned count,
562 struct pipe_surface ** surfaces)
563 {
564 struct r600_context *ctx = (struct r600_context *)ctx_;
565 struct r600_surface **resources = (struct r600_surface **)surfaces;
566
567 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
568 start, count);
569
570 for (int i = 0; i < count; i++) {
571 /* The first two vertex buffers are reserved for parameters and
572 * global buffers. */
573 unsigned vtx_id = 2 + i;
574 if (resources[i]) {
575 struct r600_resource_global *buffer =
576 (struct r600_resource_global*)
577 resources[i]->base.texture;
578 if (resources[i]->base.writable) {
579 assert(i+1 < 12);
580
581 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
582 (struct r600_resource *)resources[i]->base.texture,
583 buffer->chunk->start_in_dw*4,
584 resources[i]->base.texture->width0);
585 }
586
587 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
588 buffer->chunk->start_in_dw * 4,
589 resources[i]->base.texture);
590 }
591 }
592 }
593
594 static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
595 unsigned start_slot, unsigned count,
596 struct pipe_sampler_view **views)
597 {
598 struct r600_pipe_sampler_view **resource =
599 (struct r600_pipe_sampler_view **)views;
600
601 for (int i = 0; i < count; i++) {
602 if (resource[i]) {
603 assert(i+1 < 12);
604 /* XXX: Implement */
605 assert(!"Compute samplers not implemented.");
606 ///FETCH0 = VTX0 (param buffer),
607 //FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
608 }
609 }
610 }
611
612 static void evergreen_bind_compute_sampler_states(
613 struct pipe_context *ctx_,
614 unsigned start_slot,
615 unsigned num_samplers,
616 void **samplers_)
617 {
618 struct compute_sampler_state ** samplers =
619 (struct compute_sampler_state **)samplers_;
620
621 for (int i = 0; i < num_samplers; i++) {
622 if (samplers[i]) {
623 /* XXX: Implement */
624 assert(!"Compute samplers not implemented.");
625 }
626 }
627 }
628
629 static void evergreen_set_global_binding(
630 struct pipe_context *ctx_, unsigned first, unsigned n,
631 struct pipe_resource **resources,
632 uint32_t **handles)
633 {
634 struct r600_context *ctx = (struct r600_context *)ctx_;
635 struct compute_memory_pool *pool = ctx->screen->global_pool;
636 struct r600_resource_global **buffers =
637 (struct r600_resource_global **)resources;
638
639 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
640 first, n);
641
642 if (!resources) {
643 /* XXX: Unset */
644 return;
645 }
646
647 compute_memory_finalize_pending(pool, ctx_);
648
649 for (int i = 0; i < n; i++)
650 {
651 assert(resources[i]->target == PIPE_BUFFER);
652 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
653
654 *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
655 }
656
657 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
658 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
659 (struct pipe_resource*)pool->bo);
660 }
661
662 /**
663 * This function initializes all the compute specific registers that need to
664 * be initialized for each compute command stream. Registers that are common
665 * to both compute and 3D will be initialized at the beginning of each compute
666 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
667 * packet requires that the shader type bit be set, we must initialize all
668 * context registers needed for compute in this function. The registers
669 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
670 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
671 * on the GPU family.
672 */
673 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
674 {
675 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
676 int num_threads;
677 int num_stack_entries;
678
679 /* since all required registers are initialised in the
680 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
681 */
682 r600_init_command_buffer(cb, 256);
683 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
684
685 /* This must be first. */
686 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
687 r600_store_value(cb, 0x80000000);
688 r600_store_value(cb, 0x80000000);
689
690 /* We're setting config registers here. */
691 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
692 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
693
694 switch (ctx->family) {
695 case CHIP_CEDAR:
696 default:
697 num_threads = 128;
698 num_stack_entries = 256;
699 break;
700 case CHIP_REDWOOD:
701 num_threads = 128;
702 num_stack_entries = 256;
703 break;
704 case CHIP_JUNIPER:
705 num_threads = 128;
706 num_stack_entries = 512;
707 break;
708 case CHIP_CYPRESS:
709 case CHIP_HEMLOCK:
710 num_threads = 128;
711 num_stack_entries = 512;
712 break;
713 case CHIP_PALM:
714 num_threads = 128;
715 num_stack_entries = 256;
716 break;
717 case CHIP_SUMO:
718 num_threads = 128;
719 num_stack_entries = 256;
720 break;
721 case CHIP_SUMO2:
722 num_threads = 128;
723 num_stack_entries = 512;
724 break;
725 case CHIP_BARTS:
726 num_threads = 128;
727 num_stack_entries = 512;
728 break;
729 case CHIP_TURKS:
730 num_threads = 128;
731 num_stack_entries = 256;
732 break;
733 case CHIP_CAICOS:
734 num_threads = 128;
735 num_stack_entries = 256;
736 break;
737 }
738
739 /* Config Registers */
740 if (ctx->chip_class < CAYMAN)
741 evergreen_init_common_regs(cb, ctx->chip_class, ctx->family,
742 ctx->screen->info.drm_minor);
743 else
744 cayman_init_common_regs(cb, ctx->chip_class, ctx->family,
745 ctx->screen->info.drm_minor);
746
747 /* The primitive type always needs to be POINTLIST for compute. */
748 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
749 V_008958_DI_PT_POINTLIST);
750
751 if (ctx->chip_class < CAYMAN) {
752
753 /* These registers control which simds can be used by each stage.
754 * The default for these registers is 0xffffffff, which means
755 * all simds are available for each stage. It's possible we may
756 * want to play around with these in the future, but for now
757 * the default value is fine.
758 *
759 * R_008E20_SQ_STATIC_THREAD_MGMT1
760 * R_008E24_SQ_STATIC_THREAD_MGMT2
761 * R_008E28_SQ_STATIC_THREAD_MGMT3
762 */
763
764 /* XXX: We may need to adjust the thread and stack resource
765 * values for 3D/compute interop */
766
767 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
768
769 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
770 * Set the number of threads used by the PS/VS/GS/ES stage to
771 * 0.
772 */
773 r600_store_value(cb, 0);
774
775 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
776 * Set the number of threads used by the CS (aka LS) stage to
777 * the maximum number of threads and set the number of threads
778 * for the HS stage to 0. */
779 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
780
781 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
782 * Set the Control Flow stack entries to 0 for PS/VS stages */
783 r600_store_value(cb, 0);
784
785 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
786 * Set the Control Flow stack entries to 0 for GS/ES stages */
787 r600_store_value(cb, 0);
788
789 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
790 * Set the Control Flow stack entries to 0 for the HS stage, and
791 * set it to the maximum value for the CS (aka LS) stage. */
792 r600_store_value(cb,
793 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
794 }
795
796 /* Context Registers */
797
798 if (ctx->chip_class < CAYMAN) {
799 /* workaround for hw issues with dyn gpr - must set all limits
800 * to 240 instead of 0, 0x1e == 240 / 8
801 */
802 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
803 S_028838_PS_GPRS(0x1e) |
804 S_028838_VS_GPRS(0x1e) |
805 S_028838_GS_GPRS(0x1e) |
806 S_028838_ES_GPRS(0x1e) |
807 S_028838_HS_GPRS(0x1e) |
808 S_028838_LS_GPRS(0x1e));
809 }
810
811 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
812 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
813 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
814
815 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
816
817 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
818 S_0286E8_TID_IN_GROUP_ENA
819 | S_0286E8_TGID_ENA
820 | S_0286E8_DISABLE_INDEX_PACK)
821 ;
822
823 /* The LOOP_CONST registers are an optimization for loops that allows
824 * you to store the initial counter, increment value, and maximum
825 * counter value in a register so that hardware can calculate the
826 * correct number of iterations for the loop, so that you don't need
827 * to have the loop counter in your shader code. We don't currently use
828 * this optimization, so we must keep track of the counter in the
829 * shader and use a break instruction to exit loops. However, the
830 * hardware will still use this register to determine when to exit a
831 * loop, so we need to initialize the counter to 0, set the increment
832 * value to 1 and the maximum counter value to 4095 (0xfff), which
833 * is the maximum value allowed. This gives us a maximum of 4096
834 * iterations for our loops, but hopefully our break instruction will
835 * execute some time before the 4096th iteration.
836 */
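/* The value 0x1000FFF written below appears to pack the three fields named
 * above: increment = 1, initial counter = 0 and a maximum count of 0xfff
 * (4095). The exact bit positions are an assumption based on this comment,
 * not on register documentation.
 */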
837 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
838 }
839
840 void evergreen_init_compute_state_functions(struct r600_context *ctx)
841 {
842 ctx->context.create_compute_state = evergreen_create_compute_state;
843 ctx->context.delete_compute_state = evergreen_delete_compute_state;
844 ctx->context.bind_compute_state = evergreen_bind_compute_state;
845 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
846 ctx->context.set_compute_resources = evergreen_set_compute_resources;
847 ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
848 ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
849 ctx->context.set_global_binding = evergreen_set_global_binding;
850 ctx->context.launch_grid = evergreen_launch_grid;
851
852 /* We always use at least one vertex buffer for parameters (id = 1)*/
853 ctx->cs_vertex_buffer_state.enabled_mask =
854 ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
855 }
856
857
858 struct pipe_resource *r600_compute_global_buffer_create(
859 struct pipe_screen *screen,
860 const struct pipe_resource *templ)
861 {
862 struct r600_resource_global* result = NULL;
863 struct r600_screen* rscreen = NULL;
864 int size_in_dw = 0;
865
866 assert(templ->target == PIPE_BUFFER);
867 assert(templ->bind & PIPE_BIND_GLOBAL);
868 assert(templ->array_size == 1 || templ->array_size == 0);
869 assert(templ->depth0 == 1 || templ->depth0 == 0);
870 assert(templ->height0 == 1 || templ->height0 == 0);
871
872 result = (struct r600_resource_global*)
873 CALLOC(sizeof(struct r600_resource_global), 1);
874 rscreen = (struct r600_screen*)screen;
875
876 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
877 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
878 templ->array_size);
879
880 result->base.b.vtbl = &r600_global_buffer_vtbl;
881 result->base.b.b.screen = screen;
882 result->base.b.b = *templ;
883 pipe_reference_init(&result->base.b.b.reference, 1);
884
885 size_in_dw = (templ->width0+3) / 4;
886
887 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
888
889 if (result->chunk == NULL)
890 {
891 free(result);
892 return NULL;
893 }
894
895 return &result->base.b.b;
896 }
897
898 void r600_compute_global_buffer_destroy(
899 struct pipe_screen *screen,
900 struct pipe_resource *res)
901 {
902 struct r600_resource_global* buffer = NULL;
903 struct r600_screen* rscreen = NULL;
904
905 assert(res->target == PIPE_BUFFER);
906 assert(res->bind & PIPE_BIND_GLOBAL);
907
908 buffer = (struct r600_resource_global*)res;
909 rscreen = (struct r600_screen*)screen;
910
911 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
912
913 buffer->chunk = NULL;
914 free(res);
915 }
916
917 void *r600_compute_global_transfer_map(
918 struct pipe_context *ctx_,
919 struct pipe_resource *resource,
920 unsigned level,
921 unsigned usage,
922 const struct pipe_box *box,
923 struct pipe_transfer **ptransfer)
924 {
925 struct r600_context *rctx = (struct r600_context*)ctx_;
926 struct compute_memory_pool *pool = rctx->screen->global_pool;
927 struct r600_resource_global* buffer =
928 (struct r600_resource_global*)resource;
929
930 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
931 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
932 "width = %u, height = %u, depth = %u)\n", level, usage,
933 box->x, box->y, box->z, box->width, box->height,
934 box->depth);
935 COMPUTE_DBG(rctx->screen, "Buffer: %u (buffer offset in global memory) "
936 "+ %u (box.x)\n", buffer->chunk->start_in_dw, box->x);
937
938
939 compute_memory_finalize_pending(pool, ctx_);
940
941 assert(resource->target == PIPE_BUFFER);
942 assert(resource->bind & PIPE_BIND_GLOBAL);
943 assert(box->x >= 0);
944 assert(box->y == 0);
945 assert(box->z == 0);
946
947 ///TODO: do it better, mapping is not possible if the pool is too big
948 return pipe_buffer_map_range(ctx_, (struct pipe_resource*)buffer->chunk->pool->bo,
949 box->x + (buffer->chunk->start_in_dw * 4),
950 box->width, usage, ptransfer);
951 }
952
953 void r600_compute_global_transfer_unmap(
954 struct pipe_context *ctx_,
955 struct pipe_transfer* transfer)
956 {
957 /* A struct r600_resource_global is not a real resource; it just maps
958 * to an offset within the compute memory pool. The function
959 * r600_compute_global_transfer_map() maps the memory pool
960 * resource rather than the struct r600_resource_global passed to
961 * it as an argument, and then initializes ptransfer->resource with
962 * the memory pool resource (via pipe_buffer_map_range).
963 * When transfer_unmap is called, it goes through the memory pool
964 * resource's vtable, so the pool's transfer_unmap hook is called
965 * rather than this function.
966 */
967 assert (!"This function should not be called");
968 }
969
970 void r600_compute_global_transfer_flush_region(
971 struct pipe_context *ctx_,
972 struct pipe_transfer *transfer,
973 const struct pipe_box *box)
974 {
975 assert(0 && "TODO");
976 }
977
978 void r600_compute_global_transfer_inline_write(
979 struct pipe_context *pipe,
980 struct pipe_resource *resource,
981 unsigned level,
982 unsigned usage,
983 const struct pipe_box *box,
984 const void *data,
985 unsigned stride,
986 unsigned layer_stride)
987 {
988 assert(0 && "TODO");
989 }