src/gallium/drivers/r600/evergreen_compute.c

   1 /*
   2  * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Adam Rak <adam.rak@streamnovation.com>
  25  */
  26
  27 #include <stdio.h>
  28 #include <errno.h>
  29 #include "pipe/p_defines.h"
  30 #include "pipe/p_state.h"
  31 #include "pipe/p_context.h"
  32 #include "util/u_blitter.h"
  33 #include "util/list.h"
  34 #include "util/u_transfer.h"
  35 #include "util/u_surface.h"
  36 #include "util/u_pack_color.h"
  37 #include "util/u_memory.h"
  38 #include "util/u_inlines.h"
  39 #include "util/u_framebuffer.h"
  40 #include "pipebuffer/pb_buffer.h"
  41 #include "evergreend.h"
  42 #include "r600_shader.h"
  43 #include "r600_pipe.h"
  44 #include "r600_formats.h"
  45 #include "evergreen_compute.h"
  46 #include "evergreen_compute_internal.h"
  47 #include "compute_memory_pool.h"
  48 #include "sb/sb_public.h"
  49 #include "radeon/radeon_elf_util.h"
  50 #include <inttypes.h>
  51
  52 /**
  53 RAT0 is for global binding write
  54 VTX1 is for global binding read
  55
   56 for writing images RAT1...
  57 for reading images TEX2...
  58   TEX2-RAT1 is paired
  59
  60 TEX2... consumes the same fetch resources, that VTX2... would consume
  61
  62 CONST0 and VTX0 is for parameters
  63   CONST0 is binding smaller input parameter buffer, and for constant indexing,
  64   also constant cached
  65   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  66   the constant cache can handle
  67
   68 RATs are limited to 12, so we can bind at most 11 textures for writing
   69 because we reserve RAT0 for global bindings. With byte addressing enabled,
   70 we should reserve another one too => at most 10 image bindings for writing.
  71
  72 from Nvidia OpenCL:
  73   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  74   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
  75
  76 so 10 for writing is enough. 176 is the max for reading according to the docs
  77
  78 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
  79 writable images will consume TEX slots, VTX slots too because of linear indexing
  80
  81 */
  82
  83 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
  84                                                      unsigned size)
  85 {
  86         struct pipe_resource *buffer = NULL;
  87         assert(size);
  88
  89         buffer = pipe_buffer_create((struct pipe_screen*) screen,
  90                                     0, PIPE_USAGE_IMMUTABLE, size);
  91
  92         return (struct r600_resource *)buffer;
  93 }
  94
  95
  96 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
  97                               unsigned id,
  98                               struct r600_resource *bo,
  99                               int start,
 100                               int size)
 101 {
 102         struct pipe_surface rat_templ;
 103         struct r600_surface *surf = NULL;
 104         struct r600_context *rctx = NULL;
 105
 106         assert(id < 12);
 107         assert((size & 3) == 0);
 108         assert((start & 0xFF) == 0);
 109
 110         rctx = pipe->ctx;
 111
 112         COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
 113
 114         /* Create the RAT surface */
 115         memset(&rat_templ, 0, sizeof(rat_templ));
 116         rat_templ.format = PIPE_FORMAT_R32_UINT;
 117         rat_templ.u.tex.level = 0;
 118         rat_templ.u.tex.first_layer = 0;
 119         rat_templ.u.tex.last_layer = 0;
 120
 121         /* Add the RAT the list of color buffers */
 122         pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
 123                 (struct pipe_context *)pipe->ctx,
 124                 (struct pipe_resource *)bo, &rat_templ);
 125
 126         /* Update the number of color buffers */
 127         pipe->ctx->framebuffer.state.nr_cbufs =
 128                 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
 129
 130         /* Update the cb_target_mask
 131          * XXX: I think this is a potential spot for bugs once we start doing
 132          * GL interop.  cb_target_mask may be modified in the 3D sections
 133          * of this driver. */
 134         pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
 135
 136         surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
 137         evergreen_init_color_surface_rat(rctx, surf);
 138 }
 139
 140 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
 141                                            unsigned vb_index,
 142                                            unsigned offset,
 143                                            struct pipe_resource *buffer)
 144 {
 145         struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
 146         struct pipe_vertex_buffer *vb = &state->vb[vb_index];
 147         vb->stride = 1;
 148         vb->buffer_offset = offset;
 149         vb->buffer = buffer;
 150         vb->user_buffer = NULL;
 151
 152         /* The vertex instructions in the compute shaders use the texture cache,
 153          * so we need to invalidate it. */
 154         rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 155         state->enabled_mask |= 1 << vb_index;
 156         state->dirty_mask |= 1 << vb_index;
 157         r600_mark_atom_dirty(rctx, &state->atom);
 158 }
 159
 160 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
 161                                              unsigned cb_index,
 162                                              unsigned offset,
 163                                              unsigned size,
 164                                              struct pipe_resource *buffer)
 165 {
 166         struct pipe_constant_buffer cb;
 167         cb.buffer_size = size;
 168         cb.buffer_offset = offset;
 169         cb.buffer = buffer;
 170         cb.user_buffer = NULL;
 171
 172         rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
 173 }
 174
  175 /* We need to define these R600 registers here, because we can't include
  176  * both evergreend.h and r600d.h in the same file.
  177  */
 178 #define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
 179 #define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
 180
 181 #ifdef HAVE_OPENCL
 182
 183 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
 184                                            struct r600_bytecode *bc,
 185                                            uint64_t symbol_offset,
 186                                            boolean *use_kill)
 187 {
 188        unsigned i;
 189        const unsigned char *config =
 190                radeon_shader_binary_config_start(binary, symbol_offset);
 191
 192        for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
 193                unsigned reg =
 194                        util_le32_to_cpu(*(uint32_t*)(config + i));
 195                unsigned value =
 196                        util_le32_to_cpu(*(uint32_t*)(config + i + 4));
 197                switch (reg) {
 198                /* R600 / R700 */
 199                case R_028850_SQ_PGM_RESOURCES_PS:
 200                case R_028868_SQ_PGM_RESOURCES_VS:
 201                /* Evergreen / Northern Islands */
 202                case R_028844_SQ_PGM_RESOURCES_PS:
 203                case R_028860_SQ_PGM_RESOURCES_VS:
 204                case R_0288D4_SQ_PGM_RESOURCES_LS:
 205                        bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
 206                        bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
 207                        break;
 208                case R_02880C_DB_SHADER_CONTROL:
 209                        *use_kill = G_02880C_KILL_ENABLE(value);
 210                        break;
 211                case R_0288E8_SQ_LDS_ALLOC:
 212                        bc->nlds_dw = value;
 213                        break;
 214                }
 215        }
 216 }
 217
 218 static unsigned r600_create_shader(struct r600_bytecode *bc,
 219                                    const struct ac_shader_binary *binary,
 220                                    boolean *use_kill)
 221
 222 {
 223         assert(binary->code_size % 4 == 0);
 224         bc->bytecode = CALLOC(1, binary->code_size);
 225         memcpy(bc->bytecode, binary->code, binary->code_size);
 226         bc->ndw = binary->code_size / 4;
 227
 228         r600_shader_binary_read_config(binary, bc, 0, use_kill);
 229         return 0;
 230 }
 231
 232 #endif
 233
 234 static void r600_destroy_shader(struct r600_bytecode *bc)
 235 {
 236         FREE(bc->bytecode);
 237 }
 238
 239 static void *evergreen_create_compute_state(struct pipe_context *ctx,
 240                                             const struct pipe_compute_state *cso)
 241 {
 242         struct r600_context *rctx = (struct r600_context *)ctx;
 243         struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
 244 #ifdef HAVE_OPENCL
 245         const struct pipe_llvm_program_header *header;
 246         const char *code;
 247         void *p;
 248         boolean use_kill;
 249
 250         COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
 251         header = cso->prog;
 252         code = cso->prog + sizeof(struct pipe_llvm_program_header);
 253         radeon_shader_binary_init(&shader->binary);
 254         ac_elf_read(code, header->num_bytes, &shader->binary);
 255         r600_create_shader(&shader->bc, &shader->binary, &use_kill);
 256
 257         /* Upload code + ROdata */
 258         shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
 259                                                         shader->bc.ndw * 4);
 260         p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
 261         //TODO: use util_memcpy_cpu_to_le32 ?
 262         memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
 263         rctx->b.ws->buffer_unmap(shader->code_bo->buf);
 264 #endif
 265
 266         shader->ctx = rctx;
 267         shader->local_size = cso->req_local_mem;
 268         shader->private_size = cso->req_private_mem;
 269         shader->input_size = cso->req_input_mem;
 270
 271         return shader;
 272 }
 273
 274 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
 275 {
 276         struct r600_context *rctx = (struct r600_context *)ctx;
 277         struct r600_pipe_compute *shader = state;
 278
 279         COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
 280
 281         if (!shader)
 282                 return;
 283
 284         radeon_shader_binary_clean(&shader->binary);
 285         r600_destroy_shader(&shader->bc);
 286
 287         /* TODO destroy shader->code_bo, shader->const_bo
 288          * we'll need something like r600_buffer_free */
 289         FREE(shader);
 290 }
 291
 292 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
 293 {
 294         struct r600_context *rctx = (struct r600_context *)ctx;
 295
 296         COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
 297
 298         rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
 299 }
 300
 301 /* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
 302  * kernel parameters there are implicit parameters that need to be stored
 303  * in the vertex buffer as well.  Here is how these parameters are organized in
 304  * the buffer:
 305  *
 306  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 307  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 308  * DWORDS 6-8: Number of work items within each work group in each dimension
 309  *             (x,y,z)
 310  * DWORDS 9+ : Kernel parameters
 311  */
 312 static void evergreen_compute_upload_input(struct pipe_context *ctx,
 313                                            const struct pipe_grid_info *info)
 314 {
 315         struct r600_context *rctx = (struct r600_context *)ctx;
 316         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 317         unsigned i;
 318         /* We need to reserve 9 dwords (36 bytes) for implicit kernel
 319          * parameters.
 320          */
 321         unsigned input_size = shader->input_size + 36;
 322         uint32_t *num_work_groups_start;
 323         uint32_t *global_size_start;
 324         uint32_t *local_size_start;
 325         uint32_t *kernel_parameters_start;
 326         struct pipe_box box;
 327         struct pipe_transfer *transfer = NULL;
 328
 329         if (shader->input_size == 0) {
 330                 return;
 331         }
 332
 333         if (!shader->kernel_param) {
 334                 /* Add space for the grid dimensions */
 335                 shader->kernel_param = (struct r600_resource *)
 336                         pipe_buffer_create(ctx->screen, 0,
 337                                         PIPE_USAGE_IMMUTABLE, input_size);
 338         }
 339
 340         u_box_1d(0, input_size, &box);
 341         num_work_groups_start = ctx->transfer_map(ctx,
 342                         (struct pipe_resource*)shader->kernel_param,
 343                         0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
 344                         &box, &transfer);
 345         global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
 346         local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
 347         kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
 348
 349         /* Copy the work group size */
 350         memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
 351
 352         /* Copy the global size */
 353         for (i = 0; i < 3; i++) {
 354                 global_size_start[i] = info->grid[i] * info->block[i];
 355         }
 356
 357         /* Copy the local dimensions */
 358         memcpy(local_size_start, info->block, 3 * sizeof(uint));
 359
 360         /* Copy the kernel inputs */
 361         memcpy(kernel_parameters_start, info->input, shader->input_size);
 362
 363         for (i = 0; i < (input_size / 4); i++) {
 364                 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
 365                         ((unsigned*)num_work_groups_start)[i]);
 366         }
 367
 368         ctx->transfer_unmap(ctx, transfer);
 369
 370         /* ID=0 and ID=3 are reserved for the parameters.
 371          * LLVM will preferably use ID=0, but it does not work for dynamic
 372          * indices. */
 373         evergreen_cs_set_vertex_buffer(rctx, 3, 0,
 374                         (struct pipe_resource*)shader->kernel_param);
 375         evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
 376                         (struct pipe_resource*)shader->kernel_param);
 377 }
 378
 379 static void evergreen_emit_dispatch(struct r600_context *rctx,
 380                                     const struct pipe_grid_info *info)
 381 {
 382         int i;
 383         struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 384         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 385         unsigned num_waves;
 386         unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
 387         unsigned wave_divisor = (16 * num_pipes);
 388         int group_size = 1;
 389         int grid_size = 1;
 390         unsigned lds_size = shader->local_size / 4 +
 391                 shader->bc.nlds_dw;
 392
 393
 394         /* Calculate group_size/grid_size */
 395         for (i = 0; i < 3; i++) {
 396                 group_size *= info->block[i];
 397         }
 398
 399         for (i = 0; i < 3; i++) {
 400                 grid_size *= info->grid[i];
 401         }
 402
 403         /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
 404         num_waves = (info->block[0] * info->block[1] * info->block[2] +
 405                         wave_divisor - 1) / wave_divisor;
 406
 407         COMPUTE_DBG(rctx->screen, "Using %u pipes, "
 408                                 "%u wavefronts per thread block, "
 409                                 "allocating %u dwords lds.\n",
 410                                 num_pipes, num_waves, lds_size);
 411
 412         radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
 413
 414         radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
 415         radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
 416         radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
 417         radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
 418
 419         radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
 420                                                                 group_size);
 421
 422         radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
 423         radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
 424         radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
 425         radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
 426
 427         if (rctx->b.chip_class < CAYMAN) {
 428                 assert(lds_size <= 8192);
 429         } else {
 430                 /* Cayman appears to have a slightly smaller limit, see the
 431                  * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
 432                 assert(lds_size <= 8160);
 433         }
 434
 435         radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
 436                                         lds_size | (num_waves << 14));
 437
 438         /* Dispatch packet */
 439         radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
 440         radeon_emit(cs, info->grid[0]);
 441         radeon_emit(cs, info->grid[1]);
 442         radeon_emit(cs, info->grid[2]);
 443         /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
 444         radeon_emit(cs, 1);
 445 }
 446
 447 static void compute_emit_cs(struct r600_context *rctx,
 448                             const struct pipe_grid_info *info)
 449 {
 450         struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 451         unsigned i;
 452
 453         /* make sure that the gfx ring is only one active */
 454         if (radeon_emitted(rctx->b.dma.cs, 0)) {
 455                 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
 456         }
 457
 458         /* Initialize all the compute-related registers.
 459          *
 460          * See evergreen_init_atom_start_compute_cs() in this file for the list
 461          * of registers initialized by the start_compute_cs_cmd atom.
 462          */
 463         r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
 464
 465         /* emit config state */
 466         if (rctx->b.chip_class == EVERGREEN)
 467                 r600_emit_atom(rctx, &rctx->config_state.atom);
 468
 469         rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
 470         r600_flush_emit(rctx);
 471
 472         /* Emit colorbuffers. */
 473         /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
 474         for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
 475                 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
 476                 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 477                                                        (struct r600_resource*)cb->base.texture,
 478                                                        RADEON_USAGE_READWRITE,
 479                                                        RADEON_PRIO_SHADER_RW_BUFFER);
 480
 481                 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
 482                 radeon_emit(cs, cb->cb_color_base);     /* R_028C60_CB_COLOR0_BASE */
 483                 radeon_emit(cs, cb->cb_color_pitch);    /* R_028C64_CB_COLOR0_PITCH */
 484                 radeon_emit(cs, cb->cb_color_slice);    /* R_028C68_CB_COLOR0_SLICE */
 485                 radeon_emit(cs, cb->cb_color_view);     /* R_028C6C_CB_COLOR0_VIEW */
 486                 radeon_emit(cs, cb->cb_color_info);     /* R_028C70_CB_COLOR0_INFO */
 487                 radeon_emit(cs, cb->cb_color_attrib);   /* R_028C74_CB_COLOR0_ATTRIB */
 488                 radeon_emit(cs, cb->cb_color_dim);              /* R_028C78_CB_COLOR0_DIM */
 489
 490                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
 491                 radeon_emit(cs, reloc);
 492
 493                 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
 494                 radeon_emit(cs, reloc);
 495         }
 496         for (; i < 8 ; i++)
 497                 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
 498                                                S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 499         for (; i < 12; i++)
 500                 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
 501                                                S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 502
 503         /* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
 504         radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
 505                                         rctx->compute_cb_target_mask);
 506
 507
 508         /* Emit vertex buffer state */
 509         rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
 510         r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
 511
 512         /* Emit constant buffer state */
 513         r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
 514
 515         /* Emit sampler state */
 516         r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
 517
 518         /* Emit sampler view (texture resource) state */
 519         r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
 520
 521         /* Emit compute shader state */
 522         r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
 523
 524         /* Emit dispatch state and dispatch packet */
 525         evergreen_emit_dispatch(rctx, info);
 526
 527         /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
 528          */
 529         rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
 530                       R600_CONTEXT_INV_VERTEX_CACHE |
 531                       R600_CONTEXT_INV_TEX_CACHE;
 532         r600_flush_emit(rctx);
 533         rctx->b.flags = 0;
 534
 535         if (rctx->b.chip_class >= CAYMAN) {
 536                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 537                 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 538                 /* DEALLOC_STATE prevents the GPU from hanging when a
 539                  * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
 540                  * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
 541                  */
 542                 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
 543                 radeon_emit(cs, 0);
 544         }
 545
 546 #if 0
 547         COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
 548         for (i = 0; i < cs->cdw; i++) {
 549                 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
 550         }
 551 #endif
 552
 553 }
 554
 555
 556 /**
 557  * Emit function for r600_cs_shader_state atom
 558  */
 559 void evergreen_emit_cs_shader(struct r600_context *rctx,
 560                               struct r600_atom *atom)
 561 {
 562         struct r600_cs_shader_state *state =
 563                                         (struct r600_cs_shader_state*)atom;
 564         struct r600_pipe_compute *shader = state->shader;
 565         struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 566         uint64_t va;
 567         struct r600_resource *code_bo;
 568         unsigned ngpr, nstack;
 569
 570         code_bo = shader->code_bo;
 571         va = shader->code_bo->gpu_address + state->pc;
 572         ngpr = shader->bc.ngpr;
 573         nstack = shader->bc.nstack;
 574
 575         radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
 576         radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
 577         radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
 578                         S_0288D4_NUM_GPRS(ngpr)
 579                         | S_0288D4_STACK_SIZE(nstack));
 580         radeon_emit(cs, 0);     /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
 581
 582         radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
 583         radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 584                                               code_bo, RADEON_USAGE_READ,
 585                                               RADEON_PRIO_SHADER_BINARY));
 586 }
 587
 588 static void evergreen_launch_grid(struct pipe_context *ctx,
 589                                   const struct pipe_grid_info *info)
 590 {
 591         struct r600_context *rctx = (struct r600_context *)ctx;
 592 #ifdef HAVE_OPENCL
 593         struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 594         boolean use_kill;
 595
 596         rctx->cs_shader_state.pc = info->pc;
 597         /* Get the config information for this kernel. */
 598         r600_shader_binary_read_config(&shader->binary, &shader->bc,
 599                                   info->pc, &use_kill);
 600 #endif
 601
 602         COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
 603
 604
 605         evergreen_compute_upload_input(ctx, info);
 606         compute_emit_cs(rctx, info);
 607 }
 608
 609 static void evergreen_set_compute_resources(struct pipe_context *ctx,
 610                                             unsigned start, unsigned count,
 611                                             struct pipe_surface **surfaces)
 612 {
 613         struct r600_context *rctx = (struct r600_context *)ctx;
 614         struct r600_surface **resources = (struct r600_surface **)surfaces;
 615
 616         COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
 617                         start, count);
 618
 619         for (unsigned i = 0; i < count; i++) {
 620                 /* The First four vertex buffers are reserved for parameters and
 621                  * global buffers. */
 622                 unsigned vtx_id = 4 + i;
 623                 if (resources[i]) {
 624                         struct r600_resource_global *buffer =
 625                                 (struct r600_resource_global*)
 626                                 resources[i]->base.texture;
 627                         if (resources[i]->base.writable) {
 628                                 assert(i+1 < 12);
 629
 630                                 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
 631                                 (struct r600_resource *)resources[i]->base.texture,
 632                                 buffer->chunk->start_in_dw*4,
 633                                 resources[i]->base.texture->width0);
 634                         }
 635
 636                         evergreen_cs_set_vertex_buffer(rctx, vtx_id,
 637                                         buffer->chunk->start_in_dw * 4,
 638                                         resources[i]->base.texture);
 639                 }
 640         }
 641 }
 642
 643 static void evergreen_set_global_binding(struct pipe_context *ctx,
 644                                          unsigned first, unsigned n,
 645                                          struct pipe_resource **resources,
 646                                          uint32_t **handles)
 647 {
 648         struct r600_context *rctx = (struct r600_context *)ctx;
 649         struct compute_memory_pool *pool = rctx->screen->global_pool;
 650         struct r600_resource_global **buffers =
 651                 (struct r600_resource_global **)resources;
 652         unsigned i;
 653
 654         COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
 655                         first, n);
 656
 657         if (!resources) {
 658                 /* XXX: Unset */
 659                 return;
 660         }
 661
 662         /* We mark these items for promotion to the pool if they
 663          * aren't already there */
 664         for (i = first; i < first + n; i++) {
 665                 struct compute_memory_item *item = buffers[i]->chunk;
 666
 667                 if (!is_item_in_pool(item))
 668                         buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
 669         }
 670
 671         if (compute_memory_finalize_pending(pool, ctx) == -1) {
 672                 /* XXX: Unset */
 673                 return;
 674         }
 675
 676         for (i = first; i < first + n; i++)
 677         {
 678                 uint32_t buffer_offset;
 679                 uint32_t handle;
 680                 assert(resources[i]->target == PIPE_BUFFER);
 681                 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
 682
 683                 buffer_offset = util_le32_to_cpu(*(handles[i]));
 684                 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
 685
 686                 *(handles[i]) = util_cpu_to_le32(handle);
 687         }
 688
 689         /* globals for writing */
 690         evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
 691         /* globals for reading */
 692         evergreen_cs_set_vertex_buffer(rctx, 1, 0,
 693                                 (struct pipe_resource*)pool->bo);
 694
 695         /* constants for reading, LLVM puts them in text segment */
 696         evergreen_cs_set_vertex_buffer(rctx, 2, 0,
 697                                 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
 698 }
 699
 700 /**
 701  * This function initializes all the compute specific registers that need to
 702  * be initialized for each compute command stream.  Registers that are common
 703  * to both compute and 3D will be initialized at the beginning of each compute
 704  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 705  * packet requires that the shader type bit be set, we must initialize all
 706  * context registers needed for compute in this function.  The registers
 707  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 708  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 709  * on the GPU family.
 710  */
 711 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
 712 {
 713         struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
 714         int num_threads;
 715         int num_stack_entries;
 716
 717         /* since all required registers are initialized in the
 718          * start_compute_cs_cmd atom, we can EMIT_EARLY here.
 719          */
 720         r600_init_command_buffer(cb, 256);
 721         cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
 722
 723         /* This must be first. */
 724         r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
 725         r600_store_value(cb, 0x80000000);
 726         r600_store_value(cb, 0x80000000);
 727
 728         /* We're setting config registers here. */
 729         r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 730         r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 731
 732         switch (rctx->b.family) {
 733         case CHIP_CEDAR:
 734         default:
 735                 num_threads = 128;
 736                 num_stack_entries = 256;
 737                 break;
 738         case CHIP_REDWOOD:
 739                 num_threads = 128;
 740                 num_stack_entries = 256;
 741                 break;
 742         case CHIP_JUNIPER:
 743                 num_threads = 128;
 744                 num_stack_entries = 512;
 745                 break;
 746         case CHIP_CYPRESS:
 747         case CHIP_HEMLOCK:
 748                 num_threads = 128;
 749                 num_stack_entries = 512;
 750                 break;
 751         case CHIP_PALM:
 752                 num_threads = 128;
 753                 num_stack_entries = 256;
 754                 break;
 755         case CHIP_SUMO:
 756                 num_threads = 128;
 757                 num_stack_entries = 256;
 758                 break;
 759         case CHIP_SUMO2:
 760                 num_threads = 128;
 761                 num_stack_entries = 512;
 762                 break;
 763         case CHIP_BARTS:
 764                 num_threads = 128;
 765                 num_stack_entries = 512;
 766                 break;
 767         case CHIP_TURKS:
 768                 num_threads = 128;
 769                 num_stack_entries = 256;
 770                 break;
 771         case CHIP_CAICOS:
 772                 num_threads = 128;
 773                 num_stack_entries = 256;
 774                 break;
 775         }
 776
 777         /* Config Registers */
 778         if (rctx->b.chip_class < CAYMAN)
 779                 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
 780                                            rctx->screen->b.info.drm_minor);
 781         else
 782                 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
 783                                         rctx->screen->b.info.drm_minor);
 784
 785         /* The primitive type always needs to be POINTLIST for compute. */
 786         r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
 787                                                 V_008958_DI_PT_POINTLIST);
 788
 789         if (rctx->b.chip_class < CAYMAN) {
 790
 791                 /* These registers control which simds can be used by each stage.
 792                  * The default for these registers is 0xffffffff, which means
 793                  * all simds are available for each stage.  It's possible we may
 794                  * want to play around with these in the future, but for now
 795                  * the default value is fine.
 796                  *
 797                  * R_008E20_SQ_STATIC_THREAD_MGMT1
 798                  * R_008E24_SQ_STATIC_THREAD_MGMT2
 799                  * R_008E28_SQ_STATIC_THREAD_MGMT3
 800                  */
 801
 802                 /* XXX: We may need to adjust the thread and stack resource
 803                  * values for 3D/compute interop */
 804
 805                 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
 806
 807                 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
 808                  * Set the number of threads used by the PS/VS/GS/ES stage to
 809                  * 0.
 810                  */
 811                 r600_store_value(cb, 0);
 812
 813                 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
 814                  * Set the number of threads used by the CS (aka LS) stage to
 815                  * the maximum number of threads and set the number of threads
 816                  * for the HS stage to 0. */
 817                 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
 818
 819                 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
 820                  * Set the Control Flow stack entries to 0 for PS/VS stages */
 821                 r600_store_value(cb, 0);
 822
 823                 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
 824                  * Set the Control Flow stack entries to 0 for GS/ES stages */
 825                 r600_store_value(cb, 0);
 826
 827                 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
 828                  * Set the Contol Flow stack entries to 0 for the HS stage, and
 829                  * set it to the maximum value for the CS (aka LS) stage. */
 830                 r600_store_value(cb,
 831                         S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
 832         }
 833         /* Give the compute shader all the available LDS space.
 834          * NOTE: This only sets the maximum number of dwords that a compute
 835          * shader can allocate.  When a shader is executed, we still need to
 836          * allocate the appropriate amount of LDS dwords using the
 837          * CM_R_0288E8_SQ_LDS_ALLOC register.
 838          */
 839         if (rctx->b.chip_class < CAYMAN) {
 840                 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
 841                         S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
 842         } else {
 843                 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
 844                         S_0286FC_NUM_PS_LDS(0) |
 845                         S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
 846         }
 847
 848         /* Context Registers */
 849
 850         if (rctx->b.chip_class < CAYMAN) {
 851                 /* workaround for hw issues with dyn gpr - must set all limits
 852                  * to 240 instead of 0, 0x1e == 240 / 8
 853                  */
 854                 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
 855                                 S_028838_PS_GPRS(0x1e) |
 856                                 S_028838_VS_GPRS(0x1e) |
 857                                 S_028838_GS_GPRS(0x1e) |
 858                                 S_028838_ES_GPRS(0x1e) |
 859                                 S_028838_HS_GPRS(0x1e) |
 860                                 S_028838_LS_GPRS(0x1e));
 861         }
 862
 863         /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
 864         r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
 865                 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
 866
 867         r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
 868
 869         r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
 870                                                 S_0286E8_TID_IN_GROUP_ENA
 871                                                 | S_0286E8_TGID_ENA
 872                                                 | S_0286E8_DISABLE_INDEX_PACK)
 873                                                 ;
 874
 875         /* The LOOP_CONST registers are an optimizations for loops that allows
 876          * you to store the initial counter, increment value, and maximum
 877          * counter value in a register so that hardware can calculate the
 878          * correct number of iterations for the loop, so that you don't need
 879          * to have the loop counter in your shader code.  We don't currently use
 880          * this optimization, so we must keep track of the counter in the
 881          * shader and use a break instruction to exit loops.  However, the
 882          * hardware will still uses this register to determine when to exit a
 883          * loop, so we need to initialize the counter to 0, set the increment
 884          * value to 1 and the maximum counter value to the 4095 (0xfff) which
 885          * is the maximum value allowed.  This gives us a maximum of 4096
 886          * iterations for our loops, but hopefully our break instruction will
 887          * execute before some time before the 4096th iteration.
 888          */
 889         eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
 890 }
 891
 892 void evergreen_init_compute_state_functions(struct r600_context *rctx)
 893 {
 894         rctx->b.b.create_compute_state = evergreen_create_compute_state;
 895         rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
 896         rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
 897 //       rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
 898         rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
 899         rctx->b.b.set_global_binding = evergreen_set_global_binding;
 900         rctx->b.b.launch_grid = evergreen_launch_grid;
 901
 902 }
 903
 904 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
 905                                               struct pipe_resource *resource,
 906                                               unsigned level,
 907                                               unsigned usage,
 908                                               const struct pipe_box *box,
 909                                               struct pipe_transfer **ptransfer)
 910 {
 911         struct r600_context *rctx = (struct r600_context*)ctx;
 912         struct compute_memory_pool *pool = rctx->screen->global_pool;
 913         struct r600_resource_global* buffer =
 914                 (struct r600_resource_global*)resource;
 915
 916         struct compute_memory_item *item = buffer->chunk;
 917         struct pipe_resource *dst = NULL;
 918         unsigned offset = box->x;
 919
 920         if (is_item_in_pool(item)) {
 921                 compute_memory_demote_item(pool, item, ctx);
 922         }
 923         else {
 924                 if (item->real_buffer == NULL) {
 925                         item->real_buffer =
 926                                         r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
 927                 }
 928         }
 929
 930         dst = (struct pipe_resource*)item->real_buffer;
 931
 932         if (usage & PIPE_TRANSFER_READ)
 933                 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
 934
 935         COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
 936                         "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
 937                         "width = %u, height = %u, depth = %u)\n", level, usage,
 938                         box->x, box->y, box->z, box->width, box->height,
 939                         box->depth);
 940         COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
 941                 "%u (box.x)\n", item->id, box->x);
 942
 943
 944         assert(resource->target == PIPE_BUFFER);
 945         assert(resource->bind & PIPE_BIND_GLOBAL);
 946         assert(box->x >= 0);
 947         assert(box->y == 0);
 948         assert(box->z == 0);
 949
 950         ///TODO: do it better, mapping is not possible if the pool is too big
 951         return pipe_buffer_map_range(ctx, dst,
 952                         offset, box->width, usage, ptransfer);
 953 }
 954
 955 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
 956                                                struct pipe_transfer *transfer)
 957 {
 958         /* struct r600_resource_global are not real resources, they just map
 959          * to an offset within the compute memory pool.  The function
 960          * r600_compute_global_transfer_map() maps the memory pool
 961          * resource rather than the struct r600_resource_global passed to
 962          * it as an argument and then initalizes ptransfer->resource with
 963          * the memory pool resource (via pipe_buffer_map_range).
 964          * When transfer_unmap is called it uses the memory pool's
 965          * vtable which calls r600_buffer_transfer_map() rather than
 966          * this function.
 967          */
 968         assert (!"This function should not be called");
 969 }
 970
 971 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
 972                                                       struct pipe_transfer *transfer,
 973                                                       const struct pipe_box *box)
 974 {
 975         assert(0 && "TODO");
 976 }
 977
 978 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
 979                                                struct pipe_resource *res)
 980 {
 981         struct r600_resource_global* buffer = NULL;
 982         struct r600_screen* rscreen = NULL;
 983
 984         assert(res->target == PIPE_BUFFER);
 985         assert(res->bind & PIPE_BIND_GLOBAL);
 986
 987         buffer = (struct r600_resource_global*)res;
 988         rscreen = (struct r600_screen*)screen;
 989
 990         compute_memory_free(rscreen->global_pool, buffer->chunk->id);
 991
 992         buffer->chunk = NULL;
 993         free(res);
 994 }
 995
 996 static const struct u_resource_vtbl r600_global_buffer_vtbl =
 997 {
 998         u_default_resource_get_handle, /* get_handle */
 999         r600_compute_global_buffer_destroy, /* resource_destroy */
1000         r600_compute_global_transfer_map, /* transfer_map */
1001         r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1002         r600_compute_global_transfer_unmap, /* transfer_unmap */
1003 };
1004
1005 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1006                                                         const struct pipe_resource *templ)
1007 {
1008         struct r600_resource_global* result = NULL;
1009         struct r600_screen* rscreen = NULL;
1010         int size_in_dw = 0;
1011
1012         assert(templ->target == PIPE_BUFFER);
1013         assert(templ->bind & PIPE_BIND_GLOBAL);
1014         assert(templ->array_size == 1 || templ->array_size == 0);
1015         assert(templ->depth0 == 1 || templ->depth0 == 0);
1016         assert(templ->height0 == 1 || templ->height0 == 0);
1017
1018         result = (struct r600_resource_global*)
1019         CALLOC(sizeof(struct r600_resource_global), 1);
1020         rscreen = (struct r600_screen*)screen;
1021
1022         COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1023         COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1024                         templ->array_size);
1025
1026         result->base.b.vtbl = &r600_global_buffer_vtbl;
1027         result->base.b.b = *templ;
1028         result->base.b.b.screen = screen;
1029         pipe_reference_init(&result->base.b.b.reference, 1);
1030
1031         size_in_dw = (templ->width0+3) / 4;
1032
1033         result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1034
1035         if (result->chunk == NULL)
1036         {
1037                 free(result);
1038                 return NULL;
1039         }
1040
1041         return &result->base.b.b;
1042 }