r600/eg: add support for tracing IBs after a hang.
mesa.git: src/gallium/drivers/r600/evergreen_compute.c
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "ac_binary.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include <inttypes.h>

/**
RAT0 is for global binding writes
VTX1 is for global binding reads

for writing images: RAT1...
for reading images: TEX2...
TEX2-RAT1 are paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
CONST0 binds the smaller input parameter buffer, supports constant indexing,
and is cached in the constant cache
VTX0 is for indirect/non-constant indexing, or if the input is bigger than
the constant cache can handle

RATs are limited to 12, so we can bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => at most 10 image bindings for writing.

from Nvidia OpenCL:
CL_DEVICE_MAX_READ_IMAGE_ARGS:  128
CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images consume TEX slots, and VTX slots too, because of linear indexing

*/

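/* Illustrative summary of the bindings described above, assembled from this
 * file's own setup code rather than from hardware documentation:
 *
 *   RAT0       - global memory pool, writes   (evergreen_set_global_binding)
 *   RAT(i+1)   - writable image/surface i     (evergreen_set_compute_resources)
 *   CONST0     - kernel parameters            (evergreen_compute_upload_input)
 *   VTX0/VTX3  - kernel parameters; vertex buffer 3 is the one actually bound
 *   VTX1       - global memory pool, reads    (evergreen_set_global_binding)
 *   VTX2       - shader code buffer; LLVM places constants in the text segment
 *   VTX4+      - other compute resources      (evergreen_set_compute_resources)
 */
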
struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}

static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop. cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer.resource = buffer;
	vb->is_user_buffer = false;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * both evergreend.h and r600d.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
#define R_028850_SQ_PGM_RESOURCES_PS 0x028850

#ifdef HAVE_OPENCL

static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		ac_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct ac_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	ac_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	radeon_shader_binary_clean(&shader->binary);
	r600_destroy_shader(&shader->bc);

	/* TODO destroy shader->code_bo, shader->const_bo
	 * we'll need something like r600_buffer_free */
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;

	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
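/* A hypothetical worked example of the layout above: launching with
 * grid = (4, 2, 1) and block = (16, 16, 1) fills the buffer as
 *   DWORDS 0-2: 4, 2, 1     (work group count per dimension)
 *   DWORDS 3-5: 64, 32, 1   (global size = grid * block per dimension)
 *   DWORDS 6-8: 16, 16, 1   (work group size per dimension)
 *   DWORDS 9+ : the kernel's own arguments, copied from info->input below.
 */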
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					   PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->transfer_map(ctx,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the grid size (number of work groups per dimension) */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->transfer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM prefers to use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource*)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 +
			    shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
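	/* Illustrative worked example: block = (16, 16, 1) is 256 threads; if
	 * num_pipes were 8, the wave divisor would be 128, so num_waves =
	 * (256 + 127) / 128 = 2 wavefronts per thread block. */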
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
		     wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
			"%u wavefronts per thread block, "
			"allocating %u dwords lds.\n",
			num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
			      group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

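	/* Illustrative packing, inferred from the shift below rather than from
	 * hardware documentation: the LDS size occupies the low bits and the
	 * wave count starts at bit 14, so e.g. lds_size = 1024 dwords with
	 * num_waves = 2 encodes as 1024 | (2 << 14) = 0x8400. */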
	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, info->grid[0]);
	radeon_emit(cs, info->grid[1]);
	radeon_emit(cs, info->grid[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}

static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	unsigned i;

	/* Make sure the gfx ring is the only ring active by flushing
	 * any pending DMA work. */
	if (radeon_emitted(rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN)
		r600_emit_atom(rctx, &rctx->config_state.atom);

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						(struct r600_resource*)cb->base.texture,
						RADEON_USAGE_READWRITE,
						RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8 ; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					rctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit compute shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
		      R600_CONTEXT_INV_VERTEX_CACHE |
		      R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}

/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
		(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr) |
			S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);       /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					code_bo, RADEON_USAGE_READ,
					RADEON_PRIO_SHADER_BINARY));
}

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	rctx->cs_shader_state.pc = info->pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc,
				       info->pc, &use_kill);
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
		    start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 4 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw*4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
		    first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

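	/* Illustrative example of the handle rewrite below: the app passes a
	 * byte offset within its buffer through handles[i]; if the chunk was
	 * placed at dword 256 of the pool and the incoming offset is 16, the
	 * handle written back is 16 + 256 * 4 = 1040, i.e. a byte offset into
	 * the pool itself. */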
	for (i = first; i < first + n; i++)
	{
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading; LLVM puts them in the text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}

/**
 * This function initializes all the compute-specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs, depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (rctx->b.chip_class < CAYMAN)
		evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
					   rctx->screen->b.info.drm_minor);
	else
		cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
					rctx->screen->b.info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
			      V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate. When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
				      S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
				       S_0286FC_NUM_PS_LDS(0) |
				       S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				       S_028838_PS_GPRS(0x1e) |
				       S_028838_VS_GPRS(0x1e) |
				       S_028838_GS_GPRS(0x1e) |
				       S_028838_ES_GPRS(0x1e) |
				       S_028838_HS_GPRS(0x1e) |
				       S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
			       S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA(1) |
			       S_0286E8_TGID_ENA(1) |
			       S_0286E8_DISABLE_INDEX_PACK(1));

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code. We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware still uses this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
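	/* Decoding the magic value below (a sketch, assuming the usual r600
	 * loop-constant field layout of max count in bits [11:0], initial
	 * value in [23:12] and increment in [31:24]):
	 *   0x1000FFF = (1 << 24) | (0 << 12) | 0xfff
	 * i.e. increment 1, initial counter 0, maximum count 4095. */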
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
					      struct pipe_resource *resource,
					      unsigned level,
					      unsigned usage,
					      const struct pipe_box *box,
					      struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
			"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	/* TODO: do this better; mapping is not possible if the pool is too big */
	return pipe_buffer_map_range(ctx, dst,
			offset, box->width, usage, ptransfer);
}

static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					       struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool. The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}

static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
						      struct pipe_transfer *transfer,
						      const struct pipe_box *box)
{
	assert(0 && "TODO");
}

static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					       struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
};

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
		    templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}