src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also cached in the constant cache
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing,
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => at most 10 image bindings for writing.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
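/* A worked example of the scheme above (illustrative numbers only): of the
 * 12 RATs, RAT0 is reserved for the global buffer and, with byte addressing
 * enabled, one more RAT is reserved as well, leaving at most 10 RATs for
 * writable images. Following the RAT(id+1) rule, a writable image with
 * id 0 lands in RAT1 and one with id 3 lands in RAT4; reads of the same
 * image go through the paired TEX slot (TEX2 for RAT1). */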
85
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 PIPE_BIND_CUSTOM,
94 PIPE_USAGE_IMMUTABLE,
95 size);
96
97 return (struct r600_resource *)buffer;
98 }
99
100
101 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
102 unsigned id,
103 struct r600_resource *bo,
104 int start,
105 int size)
106 {
107 struct pipe_surface rat_templ;
108 struct r600_surface *surf = NULL;
109 struct r600_context *rctx = NULL;
110
111 assert(id < 12);
112 assert((size & 3) == 0);
113 assert((start & 0xFF) == 0);
114
115 rctx = pipe->ctx;
116
117 COMPUTE_DBG(rctx->screen, "bind rat: %u\n", id);
118
119 /* Create the RAT surface */
120 memset(&rat_templ, 0, sizeof(rat_templ));
121 rat_templ.format = PIPE_FORMAT_R32_UINT;
122 rat_templ.u.tex.level = 0;
123 rat_templ.u.tex.first_layer = 0;
124 rat_templ.u.tex.last_layer = 0;
125
126 /* Add the RAT to the list of color buffers */
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
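/* For example, assuming the usual four write-enable bits per color buffer,
 * id = 1 ORs in 0xf << 4 = 0x000000f0, i.e. all four channels of CB1. */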
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer = buffer;
155 vb->user_buffer = NULL;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 /* We need to define these R600 registers here, because we can't include
181 * both evergreend.h and r600d.h.
182 */
183 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
184 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
185
186 #ifdef HAVE_OPENCL
187
188 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
189 struct r600_bytecode *bc,
190 uint64_t symbol_offset,
191 boolean *use_kill)
192 {
193 unsigned i;
194 const unsigned char *config =
195 radeon_shader_binary_config_start(binary, symbol_offset);
196
197 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
198 unsigned reg =
199 util_le32_to_cpu(*(uint32_t*)(config + i));
200 unsigned value =
201 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
202 switch (reg) {
203 /* R600 / R700 */
204 case R_028850_SQ_PGM_RESOURCES_PS:
205 case R_028868_SQ_PGM_RESOURCES_VS:
206 /* Evergreen / Northern Islands */
207 case R_028844_SQ_PGM_RESOURCES_PS:
208 case R_028860_SQ_PGM_RESOURCES_VS:
209 case R_0288D4_SQ_PGM_RESOURCES_LS:
210 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
211 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
212 break;
213 case R_02880C_DB_SHADER_CONTROL:
214 *use_kill = G_02880C_KILL_ENABLE(value);
215 break;
216 case R_0288E8_SQ_LDS_ALLOC:
217 bc->nlds_dw = value;
218 break;
219 }
220 }
221 }
222
223 static unsigned r600_create_shader(struct r600_bytecode *bc,
224 const struct radeon_shader_binary *binary,
225 boolean *use_kill)
226
227 {
228 assert(binary->code_size % 4 == 0);
229 bc->bytecode = CALLOC(1, binary->code_size);
230 memcpy(bc->bytecode, binary->code, binary->code_size);
231 bc->ndw = binary->code_size / 4;
232
233 r600_shader_binary_read_config(binary, bc, 0, use_kill);
234 return 0;
235 }
236
237 #endif
238
239 static void r600_destroy_shader(struct r600_bytecode *bc)
240 {
241 FREE(bc->bytecode);
242 }
243
244 static void *evergreen_create_compute_state(struct pipe_context *ctx,
245 const struct pipe_compute_state *cso)
246 {
247 struct r600_context *rctx = (struct r600_context *)ctx;
248 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
249 #ifdef HAVE_OPENCL
250 const struct pipe_llvm_program_header *header;
251 const char *code;
252 void *p;
253 boolean use_kill;
254
255 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
256 header = cso->prog;
257 code = cso->prog + sizeof(struct pipe_llvm_program_header);
258 radeon_shader_binary_init(&shader->binary);
259 radeon_elf_read(code, header->num_bytes, &shader->binary);
260 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
261
262 /* Upload code + ROdata */
263 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
264 shader->bc.ndw * 4);
265 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
266 //TODO: use util_memcpy_cpu_to_le32 ?
267 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
268 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
269 #endif
270
271 shader->ctx = rctx;
272 shader->local_size = cso->req_local_mem;
273 shader->private_size = cso->req_private_mem;
274 shader->input_size = cso->req_input_mem;
275
276 return shader;
277 }
278
279 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
280 {
281 struct r600_context *rctx = (struct r600_context *)ctx;
282 struct r600_pipe_compute *shader = state;
283
284 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
285
286 if (!shader)
287 return;
288
289 radeon_shader_binary_clean(&shader->binary);
290 r600_destroy_shader(&shader->bc);
291
292 /* TODO destroy shader->code_bo, shader->const_bo
293 * we'll need something like r600_buffer_free */
294 FREE(shader);
295 }
296
297 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
298 {
299 struct r600_context *rctx = (struct r600_context *)ctx;
300
301 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
302
303 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
304 }
305
306 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
307 * kernel parameters, there are implicit parameters that need to be stored
308 * in the vertex buffer as well. Here is how these parameters are organized in
309 * the buffer:
310 *
311 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
312 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
313 * DWORDS 6-8: Number of work items within each work group in each dimension
314 * (x,y,z)
315 * DWORDS 9+ : Kernel parameters
316 */
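/* Illustrative example (numbers assumed, not taken from the code): a kernel
 * with two 4-byte arguments has shader->input_size = 8, so the buffer is
 * 8 + 36 = 44 bytes. With grid = (4, 2, 1) and block = (64, 1, 1) it holds:
 *   DW0-2 : 4, 2, 1     number of work groups
 *   DW3-5 : 256, 2, 1   global size = grid * block, computed below
 *   DW6-8 : 64, 1, 1    local (work group) size
 *   DW9+  : the two kernel arguments
 */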
317 static void evergreen_compute_upload_input(struct pipe_context *ctx,
318 const struct pipe_grid_info *info)
319 {
320 struct r600_context *rctx = (struct r600_context *)ctx;
321 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
322 unsigned i;
323 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
324 * parameters.
325 */
326 unsigned input_size = shader->input_size + 36;
327 uint32_t *num_work_groups_start;
328 uint32_t *global_size_start;
329 uint32_t *local_size_start;
330 uint32_t *kernel_parameters_start;
331 struct pipe_box box;
332 struct pipe_transfer *transfer = NULL;
333
334 if (shader->input_size == 0) {
335 return;
336 }
337
338 if (!shader->kernel_param) {
339 /* Add space for the grid dimensions */
340 shader->kernel_param = (struct r600_resource *)
341 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
342 PIPE_USAGE_IMMUTABLE, input_size);
343 }
344
345 u_box_1d(0, input_size, &box);
346 num_work_groups_start = ctx->transfer_map(ctx,
347 (struct pipe_resource*)shader->kernel_param,
348 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
349 &box, &transfer);
350 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
351 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
352 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
353
354 /* Copy the grid size (number of work groups) */
355 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
356
357 /* Copy the global size */
358 for (i = 0; i < 3; i++) {
359 global_size_start[i] = info->grid[i] * info->block[i];
360 }
361
362 /* Copy the local dimensions */
363 memcpy(local_size_start, info->block, 3 * sizeof(uint));
364
365 /* Copy the kernel inputs */
366 memcpy(kernel_parameters_start, info->input, shader->input_size);
367
368 for (i = 0; i < (input_size / 4); i++) {
369 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
370 ((unsigned*)num_work_groups_start)[i]);
371 }
372
373 ctx->transfer_unmap(ctx, transfer);
374
375 /* ID=0 is reserved for the parameters */
376 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
377 (struct pipe_resource*)shader->kernel_param);
378 }
379
380 static void evergreen_emit_dispatch(struct r600_context *rctx,
381 const struct pipe_grid_info *info)
382 {
383 int i;
384 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
385 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
386 unsigned num_waves;
387 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
388 unsigned wave_divisor = (16 * num_pipes);
389 int group_size = 1;
390 int grid_size = 1;
391 unsigned lds_size = shader->local_size / 4 +
392 shader->bc.nlds_dw;
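/* Example with assumed numbers: req_local_mem = 8192 bytes and
 * nlds_dw = 0 give lds_size = 8192 / 4 + 0 = 2048 dwords, well within
 * the limits asserted below. */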
393
394
395 /* Calculate group_size/grid_size */
396 for (i = 0; i < 3; i++) {
397 group_size *= info->block[i];
398 }
399
400 for (i = 0; i < 3; i++) {
401 grid_size *= info->grid[i];
402 }
403
404 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
405 num_waves = (info->block[0] * info->block[1] * info->block[2] +
406 wave_divisor - 1) / wave_divisor;
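/* Worked example with assumed numbers: num_pipes = 8 gives wave_divisor =
 * 128, so block = (64, 4, 1), i.e. 256 threads, yields
 * num_waves = (256 + 127) / 128 = 2. */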
407
408 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
409 "%u wavefronts per thread block, "
410 "allocating %u dwords lds.\n",
411 num_pipes, num_waves, lds_size);
412
413 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
414
415 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
416 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
417 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
418 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
419
420 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
421 group_size);
422
423 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
424 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
425 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
426 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
427
428 if (rctx->b.chip_class < CAYMAN) {
429 assert(lds_size <= 8192);
430 } else {
431 /* Cayman appears to have a slightly smaller limit, see the
432 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
433 assert(lds_size <= 8160);
434 }
435
436 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
437 lds_size | (num_waves << 14));
438
439 /* Dispatch packet */
440 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
441 radeon_emit(cs, info->grid[0]);
442 radeon_emit(cs, info->grid[1]);
443 radeon_emit(cs, info->grid[2]);
444 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
445 radeon_emit(cs, 1);
446 }
447
448 static void compute_emit_cs(struct r600_context *rctx,
449 const struct pipe_grid_info *info)
450 {
451 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
452 unsigned i;
453
454 /* make sure that the gfx ring is the only one active */
455 if (radeon_emitted(rctx->b.dma.cs, 0)) {
456 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
457 }
458
459 /* Initialize all the compute-related registers.
460 *
461 * See evergreen_init_atom_start_compute_cs() in this file for the list
462 * of registers initialized by the start_compute_cs_cmd atom.
463 */
464 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
465
466 /* emit config state */
467 if (rctx->b.chip_class == EVERGREEN)
468 r600_emit_atom(rctx, &rctx->config_state.atom);
469
470 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
471 r600_flush_emit(rctx);
472
473 /* Emit colorbuffers. */
474 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
475 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
476 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
477 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
478 (struct r600_resource*)cb->base.texture,
479 RADEON_USAGE_READWRITE,
480 RADEON_PRIO_SHADER_RW_BUFFER);
481
482 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
483 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
484 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
485 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
486 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
487 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
488 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
489 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
490
491 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
492 radeon_emit(cs, reloc);
493
494 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
495 radeon_emit(cs, reloc);
496 }
497 for (; i < 8 ; i++)
498 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
499 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
500 for (; i < 12; i++)
501 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
502 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
503
504 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
505 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
506 rctx->compute_cb_target_mask);
507
508
509 /* Emit vertex buffer state */
510 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
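/* The assignment above reserves 12 dwords per dirty vertex buffer; e.g.
 * with two buffers dirty, 24 dwords are reserved for this atom. */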
511 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
512
513 /* Emit constant buffer state */
514 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
515
516 /* Emit sampler state */
517 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
518
519 /* Emit sampler view (texture resource) state */
520 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
521
522 /* Emit compute shader state */
523 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
524
525 /* Emit dispatch state and dispatch packet */
526 evergreen_emit_dispatch(rctx, info);
527
528 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
529 */
530 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
531 R600_CONTEXT_INV_VERTEX_CACHE |
532 R600_CONTEXT_INV_TEX_CACHE;
533 r600_flush_emit(rctx);
534 rctx->b.flags = 0;
535
536 if (rctx->b.chip_class >= CAYMAN) {
537 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
538 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
539 /* DEALLOC_STATE prevents the GPU from hanging when a
540 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
541 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
542 */
543 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
544 radeon_emit(cs, 0);
545 }
546
547 #if 0
548 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
549 for (i = 0; i < cs->cdw; i++) {
550 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
551 }
552 #endif
553
554 }
555
556
557 /**
558 * Emit function for r600_cs_shader_state atom
559 */
560 void evergreen_emit_cs_shader(struct r600_context *rctx,
561 struct r600_atom *atom)
562 {
563 struct r600_cs_shader_state *state =
564 (struct r600_cs_shader_state*)atom;
565 struct r600_pipe_compute *shader = state->shader;
566 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
567 uint64_t va;
568 struct r600_resource *code_bo;
569 unsigned ngpr, nstack;
570
571 code_bo = shader->code_bo;
572 va = shader->code_bo->gpu_address + state->pc;
573 ngpr = shader->bc.ngpr;
574 nstack = shader->bc.nstack;
575
576 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
577 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
578 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
579 S_0288D4_NUM_GPRS(ngpr)
580 | S_0288D4_STACK_SIZE(nstack));
581 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
582
583 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
584 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
585 code_bo, RADEON_USAGE_READ,
586 RADEON_PRIO_USER_SHADER));
587 }
588
589 static void evergreen_launch_grid(struct pipe_context *ctx,
590 const struct pipe_grid_info *info)
591 {
592 struct r600_context *rctx = (struct r600_context *)ctx;
593 #ifdef HAVE_OPENCL
594 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
595 boolean use_kill;
596
597 rctx->cs_shader_state.pc = info->pc;
598 /* Get the config information for this kernel. */
599 r600_shader_binary_read_config(&shader->binary, &shader->bc,
600 info->pc, &use_kill);
601 #endif
602
603 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
604
605
606 evergreen_compute_upload_input(ctx, info);
607 compute_emit_cs(rctx, info);
608 }
609
610 static void evergreen_set_compute_resources(struct pipe_context *ctx,
611 unsigned start, unsigned count,
612 struct pipe_surface **surfaces)
613 {
614 struct r600_context *rctx = (struct r600_context *)ctx;
615 struct r600_surface **resources = (struct r600_surface **)surfaces;
616
617 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
618 start, count);
619
620 for (unsigned i = 0; i < count; i++) {
621 /* The first three vertex buffers are reserved for parameters and
622 * global buffers. */
623 unsigned vtx_id = 3 + i;
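/* As noted at the top of this file, VB0 carries the kernel parameters;
 * evergreen_set_global_binding() below binds the global memory pool to
 * VB1 and the shader code (for constant reads) to VB2, so user surfaces
 * start at VB3. */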
624 if (resources[i]) {
625 struct r600_resource_global *buffer =
626 (struct r600_resource_global*)
627 resources[i]->base.texture;
628 if (resources[i]->base.writable) {
629 assert(i+1 < 12);
630
631 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
632 (struct r600_resource *)resources[i]->base.texture,
633 buffer->chunk->start_in_dw*4,
634 resources[i]->base.texture->width0);
635 }
636
637 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
638 buffer->chunk->start_in_dw * 4,
639 resources[i]->base.texture);
640 }
641 }
642 }
643
644 static void evergreen_set_global_binding(struct pipe_context *ctx,
645 unsigned first, unsigned n,
646 struct pipe_resource **resources,
647 uint32_t **handles)
648 {
649 struct r600_context *rctx = (struct r600_context *)ctx;
650 struct compute_memory_pool *pool = rctx->screen->global_pool;
651 struct r600_resource_global **buffers =
652 (struct r600_resource_global **)resources;
653 unsigned i;
654
655 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
656 first, n);
657
658 if (!resources) {
659 /* XXX: Unset */
660 return;
661 }
662
663 /* We mark these items for promotion to the pool if they
664 * aren't already there */
665 for (i = first; i < first + n; i++) {
666 struct compute_memory_item *item = buffers[i]->chunk;
667
668 if (!is_item_in_pool(item))
669 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
670 }
671
672 if (compute_memory_finalize_pending(pool, ctx) == -1) {
673 /* XXX: Unset */
674 return;
675 }
676
677 for (i = first; i < first + n; i++)
678 {
679 uint32_t buffer_offset;
680 uint32_t handle;
681 assert(resources[i]->target == PIPE_BUFFER);
682 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
683
684 buffer_offset = util_le32_to_cpu(*(handles[i]));
685 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
686
687 *(handles[i]) = util_cpu_to_le32(handle);
688 }
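/* Worked example with assumed values: if a handle initially holds a byte
 * offset of 16 and its chunk starts at dword 100 of the pool, the loop
 * above rewrites it to 16 + 100 * 4 = 416, a byte offset into the pool
 * itself. */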
689
690 /* globals for writing */
691 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
692 /* globals for reading */
693 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
694 (struct pipe_resource*)pool->bo);
695
696 /* constants for reading, LLVM puts them in text segment */
697 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
698 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
699 }
700
701 /**
702 * This function initializes all the compute specific registers that need to
703 * be initialized for each compute command stream. Registers that are common
704 * to both compute and 3D will be initialized at the beginning of each compute
705 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
706 * packet requires that the shader type bit be set, we must initialize all
707 * context registers needed for compute in this function. The registers
708 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
709 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
710 * on the GPU family.
711 */
712 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
713 {
714 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
715 int num_threads;
716 int num_stack_entries;
717
718 /* since all required registers are initialized in the
719 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
720 */
721 r600_init_command_buffer(cb, 256);
722 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
723
724 /* This must be first. */
725 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
726 r600_store_value(cb, 0x80000000);
727 r600_store_value(cb, 0x80000000);
728
729 /* We're setting config registers here. */
730 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
731 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
732
733 switch (rctx->b.family) {
734 case CHIP_CEDAR:
735 default:
736 num_threads = 128;
737 num_stack_entries = 256;
738 break;
739 case CHIP_REDWOOD:
740 num_threads = 128;
741 num_stack_entries = 256;
742 break;
743 case CHIP_JUNIPER:
744 num_threads = 128;
745 num_stack_entries = 512;
746 break;
747 case CHIP_CYPRESS:
748 case CHIP_HEMLOCK:
749 num_threads = 128;
750 num_stack_entries = 512;
751 break;
752 case CHIP_PALM:
753 num_threads = 128;
754 num_stack_entries = 256;
755 break;
756 case CHIP_SUMO:
757 num_threads = 128;
758 num_stack_entries = 256;
759 break;
760 case CHIP_SUMO2:
761 num_threads = 128;
762 num_stack_entries = 512;
763 break;
764 case CHIP_BARTS:
765 num_threads = 128;
766 num_stack_entries = 512;
767 break;
768 case CHIP_TURKS:
769 num_threads = 128;
770 num_stack_entries = 256;
771 break;
772 case CHIP_CAICOS:
773 num_threads = 128;
774 num_stack_entries = 256;
775 break;
776 }
777
778 /* Config Registers */
779 if (rctx->b.chip_class < CAYMAN)
780 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
781 rctx->screen->b.info.drm_minor);
782 else
783 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
784 rctx->screen->b.info.drm_minor);
785
786 /* The primitive type always needs to be POINTLIST for compute. */
787 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
788 V_008958_DI_PT_POINTLIST);
789
790 if (rctx->b.chip_class < CAYMAN) {
791
792 /* These registers control which simds can be used by each stage.
793 * The default for these registers is 0xffffffff, which means
794 * all simds are available for each stage. It's possible we may
795 * want to play around with these in the future, but for now
796 * the default value is fine.
797 *
798 * R_008E20_SQ_STATIC_THREAD_MGMT1
799 * R_008E24_SQ_STATIC_THREAD_MGMT2
800 * R_008E28_SQ_STATIC_THREAD_MGMT3
801 */
802
803 /* XXX: We may need to adjust the thread and stack resource
804 * values for 3D/compute interop */
805
806 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
807
808 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
809 * Set the number of threads used by the PS/VS/GS/ES stage to
810 * 0.
811 */
812 r600_store_value(cb, 0);
813
814 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
815 * Set the number of threads used by the CS (aka LS) stage to
816 * the maximum number of threads and set the number of threads
817 * for the HS stage to 0. */
818 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
819
820 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
821 * Set the Control Flow stack entries to 0 for PS/VS stages */
822 r600_store_value(cb, 0);
823
824 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
825 * Set the Control Flow stack entries to 0 for GS/ES stages */
826 r600_store_value(cb, 0);
827
828 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
829 * Set the Control Flow stack entries to 0 for the HS stage, and
830 * set it to the maximum value for the CS (aka LS) stage. */
831 r600_store_value(cb,
832 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
833 }
834 /* Give the compute shader all the available LDS space.
835 * NOTE: This only sets the maximum number of dwords that a compute
836 * shader can allocate. When a shader is executed, we still need to
837 * allocate the appropriate amount of LDS dwords using the
838 * CM_R_0288E8_SQ_LDS_ALLOC register.
839 */
840 if (rctx->b.chip_class < CAYMAN) {
841 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
842 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
843 } else {
844 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
845 S_0286FC_NUM_PS_LDS(0) |
846 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
847 }
848
849 /* Context Registers */
850
851 if (rctx->b.chip_class < CAYMAN) {
852 /* workaround for hw issues with dyn gpr - must set all limits
853 * to 240 instead of 0, 0x1e == 240 / 8
854 */
855 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
856 S_028838_PS_GPRS(0x1e) |
857 S_028838_VS_GPRS(0x1e) |
858 S_028838_GS_GPRS(0x1e) |
859 S_028838_ES_GPRS(0x1e) |
860 S_028838_HS_GPRS(0x1e) |
861 S_028838_LS_GPRS(0x1e));
862 }
863
864 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
865 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
866 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
867
868 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
869
870 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
871 S_0286E8_TID_IN_GROUP_ENA |
872 S_0286E8_TGID_ENA |
873 S_0286E8_DISABLE_INDEX_PACK);
874
875
876 /* The LOOP_CONST registers are an optimization for loops that allows
877 * you to store the initial counter, increment value, and maximum
878 * counter value in a register so that the hardware can calculate the
879 * correct number of iterations for the loop, so that you don't need
880 * to have the loop counter in your shader code. We don't currently use
881 * this optimization, so we must keep track of the counter in the
882 * shader and use a break instruction to exit loops. However, the
883 * hardware will still use this register to determine when to exit a
884 * loop, so we need to initialize the counter to 0, set the increment
885 * value to 1 and the maximum counter value to 4095 (0xfff), which
886 * is the maximum value allowed. This gives us a maximum of 4096
887 * iterations for our loops, but hopefully our break instruction will
888 * execute some time before the 4096th iteration.
889 */
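/* Decoding the value below, assuming SQ_LOOP_CONST packs the trip count in
 * bits 11:0, the initial value in bits 23:12 and the increment in bits
 * 31:24: 0x1000FFF = increment 1, init 0, count 0xfff, matching the
 * comment above. */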
890 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
891 }
892
893 void evergreen_init_compute_state_functions(struct r600_context *rctx)
894 {
895 rctx->b.b.create_compute_state = evergreen_create_compute_state;
896 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
897 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
898 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
899 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
900 rctx->b.b.set_global_binding = evergreen_set_global_binding;
901 rctx->b.b.launch_grid = evergreen_launch_grid;
902
903 }
904
905 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
906 struct pipe_resource *resource,
907 unsigned level,
908 unsigned usage,
909 const struct pipe_box *box,
910 struct pipe_transfer **ptransfer)
911 {
912 struct r600_context *rctx = (struct r600_context*)ctx;
913 struct compute_memory_pool *pool = rctx->screen->global_pool;
914 struct r600_resource_global* buffer =
915 (struct r600_resource_global*)resource;
916
917 struct compute_memory_item *item = buffer->chunk;
918 struct pipe_resource *dst = NULL;
919 unsigned offset = box->x;
920
921 if (is_item_in_pool(item)) {
922 compute_memory_demote_item(pool, item, ctx);
923 }
924 else {
925 if (item->real_buffer == NULL) {
926 item->real_buffer =
927 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
928 }
929 }
930
931 dst = (struct pipe_resource*)item->real_buffer;
932
933 if (usage & PIPE_TRANSFER_READ)
934 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
935
936 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
937 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
938 "width = %u, height = %u, depth = %u)\n", level, usage,
939 box->x, box->y, box->z, box->width, box->height,
940 box->depth);
941 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
942 "%u (box.x)\n", item->id, box->x);
943
944
945 assert(resource->target == PIPE_BUFFER);
946 assert(resource->bind & PIPE_BIND_GLOBAL);
947 assert(box->x >= 0);
948 assert(box->y == 0);
949 assert(box->z == 0);
950
951 /* TODO: do it better; mapping is not possible if the pool is too big */
952 return pipe_buffer_map_range(ctx, dst,
953 offset, box->width, usage, ptransfer);
954 }
955
956 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
957 struct pipe_transfer *transfer)
958 {
959 /* struct r600_resource_global are not real resources, they just map
960 * to an offset within the compute memory pool. The function
961 * r600_compute_global_transfer_map() maps the memory pool
962 * resource rather than the struct r600_resource_global passed to
963 * it as an argument and then initializes ptransfer->resource with
964 * the memory pool resource (via pipe_buffer_map_range).
965 * When transfer_unmap is called it uses the memory pool's
966 * vtable which calls r600_buffer_transfer_unmap() rather than
967 * this function.
968 */
969 assert (!"This function should not be called");
970 }
971
972 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
973 struct pipe_transfer *transfer,
974 const struct pipe_box *box)
975 {
976 assert(0 && "TODO");
977 }
978
979 static void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
980 struct pipe_resource *resource,
981 unsigned level,
982 unsigned usage,
983 const struct pipe_box *box,
984 const void *data,
985 unsigned stride,
986 unsigned layer_stride)
987 {
988 assert(0 && "TODO");
989 }
990
991 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
992 struct pipe_resource *res)
993 {
994 struct r600_resource_global* buffer = NULL;
995 struct r600_screen* rscreen = NULL;
996
997 assert(res->target == PIPE_BUFFER);
998 assert(res->bind & PIPE_BIND_GLOBAL);
999
1000 buffer = (struct r600_resource_global*)res;
1001 rscreen = (struct r600_screen*)screen;
1002
1003 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1004
1005 buffer->chunk = NULL;
1006 free(res);
1007 }
1008
1009 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1010 {
1011 u_default_resource_get_handle, /* get_handle */
1012 r600_compute_global_buffer_destroy, /* resource_destroy */
1013 r600_compute_global_transfer_map, /* transfer_map */
1014 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1015 r600_compute_global_transfer_unmap, /* transfer_unmap */
1016 r600_compute_global_transfer_inline_write /* transfer_inline_write */
1017 };
1018
1019 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1020 const struct pipe_resource *templ)
1021 {
1022 struct r600_resource_global* result = NULL;
1023 struct r600_screen* rscreen = NULL;
1024 int size_in_dw = 0;
1025
1026 assert(templ->target == PIPE_BUFFER);
1027 assert(templ->bind & PIPE_BIND_GLOBAL);
1028 assert(templ->array_size == 1 || templ->array_size == 0);
1029 assert(templ->depth0 == 1 || templ->depth0 == 0);
1030 assert(templ->height0 == 1 || templ->height0 == 0);
1031
1032 result = (struct r600_resource_global*)
1033 CALLOC(sizeof(struct r600_resource_global), 1);
1034 rscreen = (struct r600_screen*)screen;
1035
1036 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1037 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1038 templ->array_size);
1039
1040 result->base.b.vtbl = &r600_global_buffer_vtbl;
1041 result->base.b.b = *templ;
1042 result->base.b.b.screen = screen;
1043 pipe_reference_init(&result->base.b.b.reference, 1);
1044
1045 size_in_dw = (templ->width0+3) / 4;
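/* The rounding above converts the byte size to whole dwords, e.g. a
 * width0 of 10 bytes becomes (10 + 3) / 4 = 3 dwords. */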
1046
1047 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1048
1049 if (result->chunk == NULL)
1050 {
1051 free(result);
1052 return NULL;
1053 }
1054
1055 return &result->base.b.b;
1056 }