r600: cleanup whitespace in evergreen_compute.c
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can only bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => at most 10 image bindings for writing.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
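/* A minimal sketch of the index math implied by the scheme above (illustrative
 * only; the actual binding is done in evergreen_set_compute_resources() and
 * evergreen_set_global_binding() below):
 *
 *   RAT id of writable image i    = i + 1   (RAT0 is the global buffer)
 *   VTX id of compute resource i  = i + 2   (VTX0 = parameters, VTX1 = global reads)
 */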
85
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 PIPE_BIND_CUSTOM,
94 PIPE_USAGE_IMMUTABLE,
95 size);
96
97 return (struct r600_resource *)buffer;
98 }
99
100
101 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
102 unsigned id,
103 struct r600_resource *bo,
104 int start,
105 int size)
106 {
107 struct pipe_surface rat_templ;
108 struct r600_surface *surf = NULL;
109 struct r600_context *rctx = NULL;
110
111 assert(id < 12);
112 assert((size & 3) == 0);
113 assert((start & 0xFF) == 0);
114
115 rctx = pipe->ctx;
116
117 COMPUTE_DBG(rctx->screen, "bind rat: %i\n", id);
118
119 /* Create the RAT surface */
120 memset(&rat_templ, 0, sizeof(rat_templ));
121 rat_templ.format = PIPE_FORMAT_R32_UINT;
122 rat_templ.u.tex.level = 0;
123 rat_templ.u.tex.first_layer = 0;
124 rat_templ.u.tex.last_layer = 0;
125
126 /* Add the RAT to the list of color buffers */
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
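/* For example, for RAT id 1 the statement above ORs in 0x000000f0, i.e. it
 * enables all four channels of color buffer 1. */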
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer = buffer;
155 vb->user_buffer = NULL;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 static const struct u_resource_vtbl r600_global_buffer_vtbl =
181 {
182 u_default_resource_get_handle, /* get_handle */
183 r600_compute_global_buffer_destroy, /* resource_destroy */
184 r600_compute_global_transfer_map, /* transfer_map */
185 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
186 r600_compute_global_transfer_unmap, /* transfer_unmap */
187 r600_compute_global_transfer_inline_write /* transfer_inline_write */
188 };
189
190 /* We need to define these R600 registers here, because we can't include
191 * r600d.h together with evergreend.h (their register definitions conflict).
192 */
193 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
194 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
195
196 #ifdef HAVE_OPENCL
197
198 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
199 struct r600_bytecode *bc,
200 uint64_t symbol_offset,
201 boolean *use_kill)
202 {
203 unsigned i;
204 const unsigned char *config =
205 radeon_shader_binary_config_start(binary, symbol_offset);
206
207 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
208 unsigned reg =
209 util_le32_to_cpu(*(uint32_t*)(config + i));
210 unsigned value =
211 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
212 switch (reg) {
213 /* R600 / R700 */
214 case R_028850_SQ_PGM_RESOURCES_PS:
215 case R_028868_SQ_PGM_RESOURCES_VS:
216 /* Evergreen / Northern Islands */
217 case R_028844_SQ_PGM_RESOURCES_PS:
218 case R_028860_SQ_PGM_RESOURCES_VS:
219 case R_0288D4_SQ_PGM_RESOURCES_LS:
220 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
221 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
222 break;
223 case R_02880C_DB_SHADER_CONTROL:
224 *use_kill = G_02880C_KILL_ENABLE(value);
225 break;
226 case R_0288E8_SQ_LDS_ALLOC:
227 bc->nlds_dw = value;
228 break;
229 }
230 }
231 }
232
233 static unsigned r600_create_shader(struct r600_bytecode *bc,
234 const struct radeon_shader_binary *binary,
235 boolean *use_kill)
236
237 {
238 assert(binary->code_size % 4 == 0);
239 bc->bytecode = CALLOC(1, binary->code_size);
240 memcpy(bc->bytecode, binary->code, binary->code_size);
241 bc->ndw = binary->code_size / 4;
242
243 r600_shader_binary_read_config(binary, bc, 0, use_kill);
244 return 0;
245 }
246
247 #endif
248
249 static void r600_destroy_shader(struct r600_bytecode *bc)
250 {
251 FREE(bc->bytecode);
252 }
253
254 void *evergreen_create_compute_state(struct pipe_context *ctx_,
255 const struct pipe_compute_state *cso)
256 {
257 struct r600_context *ctx = (struct r600_context *)ctx_;
258 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
259 #ifdef HAVE_OPENCL
260 const struct pipe_llvm_program_header *header;
261 const char *code;
262 void *p;
263 boolean use_kill;
264
265 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
266 header = cso->prog;
267 code = cso->prog + sizeof(struct pipe_llvm_program_header);
268 radeon_shader_binary_init(&shader->binary);
269 radeon_elf_read(code, header->num_bytes, &shader->binary);
270 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
271
272 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
273 shader->bc.ndw * 4);
274 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
275 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
276 ctx->b.ws->buffer_unmap(shader->code_bo->buf);
277 #endif
278
279 shader->ctx = ctx;
280 shader->local_size = cso->req_local_mem;
281 shader->private_size = cso->req_private_mem;
282 shader->input_size = cso->req_input_mem;
283
284 return shader;
285 }
286
287 void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state)
288 {
289 struct r600_context *ctx = (struct r600_context *)ctx_;
290 struct r600_pipe_compute *shader = state;
291
292 COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
293
294 if (!shader)
295 return;
296
297 radeon_shader_binary_clean(&shader->binary);
298 r600_destroy_shader(&shader->bc);
299
300 /* TODO destroy shader->code_bo, shader->const_bo
301 * we'll need something like r600_buffer_free */
302 FREE(shader);
303 }
304
305 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
306 {
307 struct r600_context *ctx = (struct r600_context *)ctx_;
308
309 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
310
311 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
312 }
313
314 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
315 * kernel parameters, there are implicit parameters that need to be stored
316 * in the vertex buffer as well. Here is how these parameters are organized in
317 * the buffer:
318 *
319 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
320 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
321 * DWORDS 6-8: Number of work items within each work group in each dimension
322 * (x,y,z)
323 * DWORDS 9+ : Kernel parameters
324 */
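/* A worked example with illustrative values: launching a kernel with
 * grid_layout = {4, 2, 1} and block_layout = {64, 1, 1} yields
 *
 *   DWORDS 0-2: 4, 2, 1     (number of work groups)
 *   DWORDS 3-5: 256, 2, 1   (global work items = grid * block per dimension)
 *   DWORDS 6-8: 64, 1, 1    (work items per work group)
 *   DWORDS 9+ : the kernel arguments, copied verbatim from 'input'
 */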
325 void evergreen_compute_upload_input(struct pipe_context *ctx_,
326 const uint *block_layout,
327 const uint *grid_layout,
328 const void *input)
329 {
330 struct r600_context *ctx = (struct r600_context *)ctx_;
331 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
332 unsigned i;
333 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
334 * parameters.
335 */
336 unsigned input_size = shader->input_size + 36;
337 uint32_t *num_work_groups_start;
338 uint32_t *global_size_start;
339 uint32_t *local_size_start;
340 uint32_t *kernel_parameters_start;
341 struct pipe_box box;
342 struct pipe_transfer *transfer = NULL;
343
344 if (shader->input_size == 0) {
345 return;
346 }
347
348 if (!shader->kernel_param) {
349 /* Add space for the grid dimensions */
350 shader->kernel_param = (struct r600_resource *)
351 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
352 PIPE_USAGE_IMMUTABLE, input_size);
353 }
354
355 u_box_1d(0, input_size, &box);
356 num_work_groups_start = ctx_->transfer_map(ctx_,
357 (struct pipe_resource*)shader->kernel_param,
358 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
359 &box, &transfer);
360 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
361 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
362 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
363
364 /* Copy the grid layout (number of work groups per dimension) */
365 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
366
367 /* Copy the global size */
368 for (i = 0; i < 3; i++) {
369 global_size_start[i] = grid_layout[i] * block_layout[i];
370 }
371
372 /* Copy the local dimensions */
373 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
374
375 /* Copy the kernel inputs */
376 memcpy(kernel_parameters_start, input, shader->input_size);
377
378 for (i = 0; i < (input_size / 4); i++) {
379 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
380 ((unsigned*)num_work_groups_start)[i]);
381 }
382
383 ctx_->transfer_unmap(ctx_, transfer);
384
385 /* ID=0 is reserved for the parameters */
386 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
387 (struct pipe_resource*)shader->kernel_param);
388 }
389
390 static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
391 const uint *block_layout,
392 const uint *grid_layout)
393 {
394 int i;
395 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
396 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
397 unsigned num_waves;
398 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
399 unsigned wave_divisor = (16 * num_pipes);
400 int group_size = 1;
401 int grid_size = 1;
402 unsigned lds_size = shader->local_size / 4 +
403 shader->bc.nlds_dw;
404
405
406 /* Calculate group_size/grid_size */
407 for (i = 0; i < 3; i++) {
408 group_size *= block_layout[i];
409 }
410
411 for (i = 0; i < 3; i++) {
412 grid_size *= grid_layout[i];
413 }
414
415 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
416 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
417 wave_divisor - 1) / wave_divisor;
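/* Worked example with illustrative numbers: block_layout = {16, 16, 1}
 * gives 256 threads; with num_pipes = 8, wave_divisor = 128, so
 * num_waves = (256 + 127) / 128 = 2 wavefronts per thread block. */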
418
419 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
420 "%u wavefronts per thread block, "
421 "allocating %u dwords lds.\n",
422 num_pipes, num_waves, lds_size);
423
424 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
425
426 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
427 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
428 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
429 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
430
431 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
432 group_size);
433
434 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
435 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
436 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
437 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
438
439 if (rctx->b.chip_class < CAYMAN) {
440 assert(lds_size <= 8192);
441 } else {
442 /* Cayman appears to have a slightly smaller limit, see the
443 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
444 assert(lds_size <= 8160);
445 }
446
447 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
448 lds_size | (num_waves << 14));
449
450 /* Dispatch packet */
451 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
452 radeon_emit(cs, grid_layout[0]);
453 radeon_emit(cs, grid_layout[1]);
454 radeon_emit(cs, grid_layout[2]);
455 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
456 radeon_emit(cs, 1);
457 }
458
459 static void compute_emit_cs(struct r600_context *ctx,
460 const uint *block_layout,
461 const uint *grid_layout)
462 {
463 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
464 unsigned i;
465
466 /* Make sure only the gfx ring is active, i.e. flush any pending DMA work. */
467 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
468 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
469 }
470
471 /* Initialize all the compute-related registers.
472 *
473 * See evergreen_init_atom_start_compute_cs() in this file for the list
474 * of registers initialized by the start_compute_cs_cmd atom.
475 */
476 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
477
478 /* emit config state */
479 if (ctx->b.chip_class == EVERGREEN)
480 r600_emit_atom(ctx, &ctx->config_state.atom);
481
482 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
483 r600_flush_emit(ctx);
484
485 /* Emit colorbuffers. */
486 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
487 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
488 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
489 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
490 (struct r600_resource*)cb->base.texture,
491 RADEON_USAGE_READWRITE,
492 RADEON_PRIO_SHADER_RW_BUFFER);
493
494 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
495 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
496 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
497 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
498 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
499 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
500 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
501 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
502
503 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
504 radeon_emit(cs, reloc);
505
506 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
507 radeon_emit(cs, reloc);
508 }
509 for (; i < 8 ; i++)
510 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
511 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
512 for (; i < 12; i++)
513 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
514 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
515
516 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
517 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
518 ctx->compute_cb_target_mask);
519
520
521 /* Emit vertex buffer state */
522 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
523 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
524
525 /* Emit constant buffer state */
526 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
527
528 /* Emit sampler state */
529 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
530
531 /* Emit sampler view (texture resource) state */
532 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
533
534 /* Emit compute shader state */
535 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
536
537 /* Emit dispatch state and dispatch packet */
538 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
539
540 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
541 */
542 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
543 R600_CONTEXT_INV_VERTEX_CACHE |
544 R600_CONTEXT_INV_TEX_CACHE;
545 r600_flush_emit(ctx);
546 ctx->b.flags = 0;
547
548 if (ctx->b.chip_class >= CAYMAN) {
549 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
550 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
551 /* DEALLOC_STATE prevents the GPU from hanging when a
552 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
553 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
554 */
555 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
556 cs->buf[cs->cdw++] = 0;
557 }
558
559 #if 0
560 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
561 for (i = 0; i < cs->cdw; i++) {
562 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
563 }
564 #endif
565
566 }
567
568
569 /**
570 * Emit function for r600_cs_shader_state atom
571 */
572 void evergreen_emit_cs_shader(struct r600_context *rctx,
573 struct r600_atom *atom)
574 {
575 struct r600_cs_shader_state *state =
576 (struct r600_cs_shader_state*)atom;
577 struct r600_pipe_compute *shader = state->shader;
578 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
579 uint64_t va;
580 struct r600_resource *code_bo;
581 unsigned ngpr, nstack;
582
583 code_bo = shader->code_bo;
584 va = shader->code_bo->gpu_address + state->pc;
585 ngpr = shader->bc.ngpr;
586 nstack = shader->bc.nstack;
587
588 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
589 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
590 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
591 S_0288D4_NUM_GPRS(ngpr)
592 | S_0288D4_STACK_SIZE(nstack));
593 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
594
595 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
596 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
597 code_bo, RADEON_USAGE_READ,
598 RADEON_PRIO_USER_SHADER));
599 }
600
601 static void evergreen_launch_grid(struct pipe_context *ctx_,
602 const struct pipe_grid_info *info)
603 {
604 struct r600_context *ctx = (struct r600_context *)ctx_;
605 #ifdef HAVE_OPENCL
606 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
607 boolean use_kill;
608
609 ctx->cs_shader_state.pc = info->pc;
610 /* Get the config information for this kernel. */
611 r600_shader_binary_read_config(&shader->binary, &shader->bc,
612 info->pc, &use_kill);
613 #endif
614
615 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
616
617
618 evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input);
619 compute_emit_cs(ctx, info->block, info->grid);
620 }
621
622 static void evergreen_set_compute_resources(struct pipe_context *ctx_,
623 unsigned start, unsigned count,
624 struct pipe_surface **surfaces)
625 {
626 struct r600_context *ctx = (struct r600_context *)ctx_;
627 struct r600_surface **resources = (struct r600_surface **)surfaces;
628
629 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
630 start, count);
631
632 for (unsigned i = 0; i < count; i++) {
633 /* The first two vertex buffers are reserved for parameters and
634 * global buffers. */
635 unsigned vtx_id = 2 + i;
636 if (resources[i]) {
637 struct r600_resource_global *buffer =
638 (struct r600_resource_global*)
639 resources[i]->base.texture;
640 if (resources[i]->base.writable) {
641 assert(i+1 < 12);
642
643 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
644 (struct r600_resource *)resources[i]->base.texture,
645 buffer->chunk->start_in_dw*4,
646 resources[i]->base.texture->width0);
647 }
648
649 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
650 buffer->chunk->start_in_dw * 4,
651 resources[i]->base.texture);
652 }
653 }
654 }
655
656 static void evergreen_set_global_binding(struct pipe_context *ctx_,
657 unsigned first, unsigned n,
658 struct pipe_resource **resources,
659 uint32_t **handles)
660 {
661 struct r600_context *ctx = (struct r600_context *)ctx_;
662 struct compute_memory_pool *pool = ctx->screen->global_pool;
663 struct r600_resource_global **buffers =
664 (struct r600_resource_global **)resources;
665 unsigned i;
666
667 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
668 first, n);
669
670 if (!resources) {
671 /* XXX: Unset */
672 return;
673 }
674
675 /* We mark these items for promotion to the pool if they
676 * aren't already there */
677 for (i = first; i < first + n; i++) {
678 struct compute_memory_item *item = buffers[i]->chunk;
679
680 if (!is_item_in_pool(item))
681 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
682 }
683
684 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
685 /* XXX: Unset */
686 return;
687 }
688
689 for (i = first; i < first + n; i++)
690 {
691 uint32_t buffer_offset;
692 uint32_t handle;
693 assert(resources[i]->target == PIPE_BUFFER);
694 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
695
696 buffer_offset = util_le32_to_cpu(*(handles[i]));
697 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
698
699 *(handles[i]) = util_cpu_to_le32(handle);
700 }
701
702 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
703 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
704 (struct pipe_resource*)pool->bo);
705 }
706
707 /**
708 * This function initializes all the compute specific registers that need to
709 * be initialized for each compute command stream. Registers that are common
710 * to both compute and 3D will be initialized at the beginning of each compute
711 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
712 * packet requires that the shader type bit be set, we must initialize all
713 * context registers needed for compute in this function. The registers
714 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
715 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
716 * on the GPU family.
717 */
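/* In practice the compute variant of these packets is selected through
 * cb->pkt_flags, which is set to RADEON_CP_PACKET3_COMPUTE_MODE just below;
 * the r600_store_*_reg() helpers OR that flag into the PKT3 headers they
 * build, marking the register writes as compute rather than 3D. */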
718 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
719 {
720 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
721 int num_threads;
722 int num_stack_entries;
723
724 /* since all required registers are initialized in the
725 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
726 */
727 r600_init_command_buffer(cb, 256);
728 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
729
730 /* This must be first. */
731 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
732 r600_store_value(cb, 0x80000000);
733 r600_store_value(cb, 0x80000000);
734
735 /* We're setting config registers here. */
736 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
737 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
738
739 switch (ctx->b.family) {
740 case CHIP_CEDAR:
741 default:
742 num_threads = 128;
743 num_stack_entries = 256;
744 break;
745 case CHIP_REDWOOD:
746 num_threads = 128;
747 num_stack_entries = 256;
748 break;
749 case CHIP_JUNIPER:
750 num_threads = 128;
751 num_stack_entries = 512;
752 break;
753 case CHIP_CYPRESS:
754 case CHIP_HEMLOCK:
755 num_threads = 128;
756 num_stack_entries = 512;
757 break;
758 case CHIP_PALM:
759 num_threads = 128;
760 num_stack_entries = 256;
761 break;
762 case CHIP_SUMO:
763 num_threads = 128;
764 num_stack_entries = 256;
765 break;
766 case CHIP_SUMO2:
767 num_threads = 128;
768 num_stack_entries = 512;
769 break;
770 case CHIP_BARTS:
771 num_threads = 128;
772 num_stack_entries = 512;
773 break;
774 case CHIP_TURKS:
775 num_threads = 128;
776 num_stack_entries = 256;
777 break;
778 case CHIP_CAICOS:
779 num_threads = 128;
780 num_stack_entries = 256;
781 break;
782 }
783
784 /* Config Registers */
785 if (ctx->b.chip_class < CAYMAN)
786 evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
787 ctx->screen->b.info.drm_minor);
788 else
789 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
790 ctx->screen->b.info.drm_minor);
791
792 /* The primitive type always needs to be POINTLIST for compute. */
793 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
794 V_008958_DI_PT_POINTLIST);
795
796 if (ctx->b.chip_class < CAYMAN) {
797
798 /* These registers control which simds can be used by each stage.
799 * The default for these registers is 0xffffffff, which means
800 * all simds are available for each stage. It's possible we may
801 * want to play around with these in the future, but for now
802 * the default value is fine.
803 *
804 * R_008E20_SQ_STATIC_THREAD_MGMT1
805 * R_008E24_SQ_STATIC_THREAD_MGMT2
806 * R_008E28_SQ_STATIC_THREAD_MGMT3
807 */
808
809 /* XXX: We may need to adjust the thread and stack resource
810 * values for 3D/compute interop */
811
812 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
813
814 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
815 * Set the number of threads used by the PS/VS/GS/ES stage to
816 * 0.
817 */
818 r600_store_value(cb, 0);
819
820 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
821 * Set the number of threads used by the CS (aka LS) stage to
822 * the maximum number of threads and set the number of threads
823 * for the HS stage to 0. */
824 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
825
826 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
827 * Set the Control Flow stack entries to 0 for PS/VS stages */
828 r600_store_value(cb, 0);
829
830 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
831 * Set the Control Flow stack entries to 0 for GS/ES stages */
832 r600_store_value(cb, 0);
833
834 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
835 * Set the Control Flow stack entries to 0 for the HS stage, and
836 * set it to the maximum value for the CS (aka LS) stage. */
837 r600_store_value(cb,
838 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
839 }
840 /* Give the compute shader all the available LDS space.
841 * NOTE: This only sets the maximum number of dwords that a compute
842 * shader can allocate. When a shader is executed, we still need to
843 * allocate the appropriate amount of LDS dwords using the
844 * CM_R_0288E8_SQ_LDS_ALLOC register.
845 */
846 if (ctx->b.chip_class < CAYMAN) {
847 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
848 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
849 } else {
850 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
851 S_0286FC_NUM_PS_LDS(0) |
852 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
853 }
854
855 /* Context Registers */
856
857 if (ctx->b.chip_class < CAYMAN) {
858 /* workaround for hw issues with dyn gpr - must set all limits
859 * to 240 instead of 0, 0x1e == 240 / 8
860 */
861 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
862 S_028838_PS_GPRS(0x1e) |
863 S_028838_VS_GPRS(0x1e) |
864 S_028838_GS_GPRS(0x1e) |
865 S_028838_ES_GPRS(0x1e) |
866 S_028838_HS_GPRS(0x1e) |
867 S_028838_LS_GPRS(0x1e));
868 }
869
870 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
871 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
872 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
873
874 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
875
876 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
877 S_0286E8_TID_IN_GROUP_ENA |
878 S_0286E8_TGID_ENA |
879 S_0286E8_DISABLE_INDEX_PACK);
881
882 /* The LOOP_CONST registers are an optimization for loops that allows
883 * you to store the initial counter, increment value, and maximum
884 * counter value in a register so that the hardware can calculate the
885 * correct number of iterations for the loop, and you don't need
886 * to have the loop counter in your shader code. We don't currently use
887 * this optimization, so we must keep track of the counter in the
888 * shader and use a break instruction to exit loops. However, the
889 * hardware still uses this register to determine when to exit a
890 * loop, so we need to initialize the counter to 0, set the increment
891 * value to 1 and the maximum counter value to 4095 (0xfff), which
892 * is the maximum value allowed. This gives us a maximum of 4096
893 * iterations for our loops, but hopefully our break instruction will
894 * execute some time before the 4096th iteration.
895 */
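/* Rough decode of the 0x1000FFF value stored below, assuming the usual
 * SQ_LOOP_CONST field layout (increment in the top byte, initial value in
 * the middle bits, trip count in the low 12 bits):
 *   increment = 0x01, init = 0x000, max count = 0xFFF (4095)
 * which matches the init/increment/maximum described above. */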
896 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
897 }
898
899 void evergreen_init_compute_state_functions(struct r600_context *ctx)
900 {
901 ctx->b.b.create_compute_state = evergreen_create_compute_state;
902 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
903 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
904 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
905 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
906 ctx->b.b.set_global_binding = evergreen_set_global_binding;
907 ctx->b.b.launch_grid = evergreen_launch_grid;
908
909 }
910
911 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
912 const struct pipe_resource *templ)
913 {
914 struct r600_resource_global* result = NULL;
915 struct r600_screen* rscreen = NULL;
916 int size_in_dw = 0;
917
918 assert(templ->target == PIPE_BUFFER);
919 assert(templ->bind & PIPE_BIND_GLOBAL);
920 assert(templ->array_size == 1 || templ->array_size == 0);
921 assert(templ->depth0 == 1 || templ->depth0 == 0);
922 assert(templ->height0 == 1 || templ->height0 == 0);
923
924 result = (struct r600_resource_global*)
925 CALLOC(sizeof(struct r600_resource_global), 1);
926 rscreen = (struct r600_screen*)screen;
927
928 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
929 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
930 templ->array_size);
931
932 result->base.b.vtbl = &r600_global_buffer_vtbl;
933 result->base.b.b = *templ;
934 result->base.b.b.screen = screen;
935 pipe_reference_init(&result->base.b.b.reference, 1);
936
937 size_in_dw = (templ->width0+3) / 4;
938
939 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
940
941 if (result->chunk == NULL)
942 {
943 free(result);
944 return NULL;
945 }
946
947 return &result->base.b.b;
948 }
949
950 void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
951 struct pipe_resource *res)
952 {
953 struct r600_resource_global* buffer = NULL;
954 struct r600_screen* rscreen = NULL;
955
956 assert(res->target == PIPE_BUFFER);
957 assert(res->bind & PIPE_BIND_GLOBAL);
958
959 buffer = (struct r600_resource_global*)res;
960 rscreen = (struct r600_screen*)screen;
961
962 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
963
964 buffer->chunk = NULL;
965 free(res);
966 }
967
968 void *r600_compute_global_transfer_map(struct pipe_context *ctx_,
969 struct pipe_resource *resource,
970 unsigned level,
971 unsigned usage,
972 const struct pipe_box *box,
973 struct pipe_transfer **ptransfer)
974 {
975 struct r600_context *rctx = (struct r600_context*)ctx_;
976 struct compute_memory_pool *pool = rctx->screen->global_pool;
977 struct r600_resource_global* buffer =
978 (struct r600_resource_global*)resource;
979
980 struct compute_memory_item *item = buffer->chunk;
981 struct pipe_resource *dst = NULL;
982 unsigned offset = box->x;
983
984 if (is_item_in_pool(item)) {
985 compute_memory_demote_item(pool, item, ctx_);
986 }
987 else {
988 if (item->real_buffer == NULL) {
989 item->real_buffer =
990 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
991 }
992 }
993
994 dst = (struct pipe_resource*)item->real_buffer;
995
996 if (usage & PIPE_TRANSFER_READ)
997 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
998
999 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1000 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1001 "width = %u, height = %u, depth = %u)\n", level, usage,
1002 box->x, box->y, box->z, box->width, box->height,
1003 box->depth);
1004 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1005 "%u (box.x)\n", item->id, box->x);
1006
1007
1008 assert(resource->target == PIPE_BUFFER);
1009 assert(resource->bind & PIPE_BIND_GLOBAL);
1010 assert(box->x >= 0);
1011 assert(box->y == 0);
1012 assert(box->z == 0);
1013
1014 ///TODO: do it better, mapping is not possible if the pool is too big
1015 return pipe_buffer_map_range(ctx_, dst,
1016 offset, box->width, usage, ptransfer);
1017 }
1018
1019 void r600_compute_global_transfer_unmap(struct pipe_context *ctx_,
1020 struct pipe_transfer *transfer)
1021 {
1022 /* struct r600_resource_global are not real resources, they just map
1023 * to an offset within the compute memory pool. The function
1024 * r600_compute_global_transfer_map() maps the memory pool
1025 * resource rather than the struct r600_resource_global passed to
1026 * it as an argument and then initializes ptransfer->resource with
1027 * the memory pool resource (via pipe_buffer_map_range).
1028 * When transfer_unmap is called it uses the memory pool's
1029 * vtable which calls r600_buffer_transfer_unmap() rather than
1030 * this function.
1031 */
1032 assert (!"This function should not be called");
1033 }
1034
1035 void r600_compute_global_transfer_flush_region(struct pipe_context *ctx_,
1036 struct pipe_transfer *transfer,
1037 const struct pipe_box *box)
1038 {
1039 assert(0 && "TODO");
1040 }
1041
1042 void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
1043 struct pipe_resource *resource,
1044 unsigned level,
1045 unsigned usage,
1046 const struct pipe_box *box,
1047 const void *data,
1048 unsigned stride,
1049 unsigned layer_stride)
1050 {
1051 assert(0 && "TODO");
1052 }