mesa.git: src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is used for global binding writes
57 VTX1 is used for global binding reads
58
59 for writing images: RAT1...
60 for reading images: TEX2...
61 TEX2 and RAT1 are paired, and so on
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are used for kernel parameters
66 CONST0 binds the smaller input parameter buffers and supports constant
67 indexing; it is also cached in the constant cache
68 VTX0 is used for indirect/non-constant indexing, or when the input is
69 bigger than the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing,
72 because we reserve RAT0 for global bindings. With byte addressing enabled
73 we should reserve another one too, leaving at most 10 writable image bindings.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so that image id i maps to RAT(i+1)
82 writable images also consume TEX and VTX slots because of linear indexing
83 (see the illustrative sketch after this comment)
84 */
85
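/* A minimal illustrative sketch (not part of the driver) of the slot
 * convention described above: compute resource i is fetched through vertex
 * buffer i+2 (VTX0/VTX1 are reserved for parameters and the global pool)
 * and, when writable, written through RAT i+1 (RAT0 is reserved for global
 * bindings). The helper names below are hypothetical.
 */
#if 0
static inline unsigned compute_resource_rat_id(unsigned i) { return i + 1; }
static inline unsigned compute_resource_vtx_id(unsigned i) { return i + 2; }
#endif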
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 PIPE_BIND_CUSTOM,
94 PIPE_USAGE_IMMUTABLE,
95 size);
96
97 return (struct r600_resource *)buffer;
98 }
99
100
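/* Bind 'bo' as RAT (Random Access Target) number 'id': wrap it in an
 * R32_UINT surface, install that surface as color buffer 'id' of the
 * compute framebuffer state, and update nr_cbufs and the compute
 * cb_target_mask to match. */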
101 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
102 unsigned id,
103 struct r600_resource *bo,
104 int start,
105 int size)
106 {
107 struct pipe_surface rat_templ;
108 struct r600_surface *surf = NULL;
109 struct r600_context *rctx = NULL;
110
111 assert(id < 12);
112 assert((size & 3) == 0);
113 assert((start & 0xFF) == 0);
114
115 rctx = pipe->ctx;
116
117 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
118
119 /* Create the RAT surface */
120 memset(&rat_templ, 0, sizeof(rat_templ));
121 rat_templ.format = PIPE_FORMAT_R32_UINT;
122 rat_templ.u.tex.level = 0;
123 rat_templ.u.tex.first_layer = 0;
124 rat_templ.u.tex.last_layer = 0;
125
126 /* Add the RAT to the list of color buffers */
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
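/* Point compute vertex buffer 'vb_index' at 'buffer' + 'offset' with a one
 * byte stride, request a texture cache invalidation (compute vertex fetches
 * go through the texture cache) and mark the CS vertex buffer atom dirty. */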
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer = buffer;
155 vb->user_buffer = NULL;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 /* We need to define these R600 registers here, because we can't include
181 * evergreend.h and r600d.h.
182 */
183 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
184 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
185
186 #ifdef HAVE_OPENCL
187
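/* Parse the config section of the shader binary for the kernel at
 * 'symbol_offset'. The section is a sequence of (register, value) pairs of
 * little-endian dwords; from them we derive the GPR count, stack size, LDS
 * allocation and whether the kill instruction is used. */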
188 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
189 struct r600_bytecode *bc,
190 uint64_t symbol_offset,
191 boolean *use_kill)
192 {
193 unsigned i;
194 const unsigned char *config =
195 radeon_shader_binary_config_start(binary, symbol_offset);
196
197 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
198 unsigned reg =
199 util_le32_to_cpu(*(uint32_t*)(config + i));
200 unsigned value =
201 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
202 switch (reg) {
203 /* R600 / R700 */
204 case R_028850_SQ_PGM_RESOURCES_PS:
205 case R_028868_SQ_PGM_RESOURCES_VS:
206 /* Evergreen / Northern Islands */
207 case R_028844_SQ_PGM_RESOURCES_PS:
208 case R_028860_SQ_PGM_RESOURCES_VS:
209 case R_0288D4_SQ_PGM_RESOURCES_LS:
210 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
211 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
212 break;
213 case R_02880C_DB_SHADER_CONTROL:
214 *use_kill = G_02880C_KILL_ENABLE(value);
215 break;
216 case R_0288E8_SQ_LDS_ALLOC:
217 bc->nlds_dw = value;
218 break;
219 }
220 }
221 }
222
223 static unsigned r600_create_shader(struct r600_bytecode *bc,
224 const struct radeon_shader_binary *binary,
225 boolean *use_kill)
226
227 {
228 assert(binary->code_size % 4 == 0);
229 bc->bytecode = CALLOC(1, binary->code_size);
230 memcpy(bc->bytecode, binary->code, binary->code_size);
231 bc->ndw = binary->code_size / 4;
232
233 r600_shader_binary_read_config(binary, bc, 0, use_kill);
234 return 0;
235 }
236
237 #endif
238
239 static void r600_destroy_shader(struct r600_bytecode *bc)
240 {
241 FREE(bc->bytecode);
242 }
243
244 static void *evergreen_create_compute_state(struct pipe_context *ctx,
245 const struct pipe_compute_state *cso)
246 {
247 struct r600_context *rctx = (struct r600_context *)ctx;
248 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
249 #ifdef HAVE_OPENCL
250 const struct pipe_llvm_program_header *header;
251 const char *code;
252 void *p;
253 boolean use_kill;
254
255 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
256 header = cso->prog;
257 code = cso->prog + sizeof(struct pipe_llvm_program_header);
258 radeon_shader_binary_init(&shader->binary);
259 radeon_elf_read(code, header->num_bytes, &shader->binary);
260 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
261
262 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
263 shader->bc.ndw * 4);
264 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
265 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
266 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
267 #endif
268
269 shader->ctx = rctx;
270 shader->local_size = cso->req_local_mem;
271 shader->private_size = cso->req_private_mem;
272 shader->input_size = cso->req_input_mem;
273
274 return shader;
275 }
276
277 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
278 {
279 struct r600_context *rctx = (struct r600_context *)ctx;
280 struct r600_pipe_compute *shader = state;
281
282 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
283
284 if (!shader)
285 return;
286
287 radeon_shader_binary_clean(&shader->binary);
288 r600_destroy_shader(&shader->bc);
289
290 /* TODO destroy shader->code_bo, shader->const_bo
291 * we'll need something like r600_buffer_free */
292 FREE(shader);
293 }
294
295 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
296 {
297 struct r600_context *rctx = (struct r600_context *)ctx;
298
299 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
300
301 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
302 }
303
304 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
305 * explicit kernel parameters, there are implicit parameters that need to be
306 * stored in the vertex buffer as well. Here is how these parameters are
307 * organized in the buffer (see also the sketch after this function):
308 *
309 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
310 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
311 * DWORDS 6-8: Number of work items within each work group in each dimension
312 * (x,y,z)
313 * DWORDS 9+ : Kernel parameters
314 */
315 static void evergreen_compute_upload_input(struct pipe_context *ctx,
316 const struct pipe_grid_info *info)
317 {
318 struct r600_context *rctx = (struct r600_context *)ctx;
319 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
320 unsigned i;
321 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
322 * parameters.
323 */
324 unsigned input_size = shader->input_size + 36;
325 uint32_t *num_work_groups_start;
326 uint32_t *global_size_start;
327 uint32_t *local_size_start;
328 uint32_t *kernel_parameters_start;
329 struct pipe_box box;
330 struct pipe_transfer *transfer = NULL;
331
332 if (shader->input_size == 0) {
333 return;
334 }
335
336 if (!shader->kernel_param) {
337 /* Add space for the grid dimensions */
338 shader->kernel_param = (struct r600_resource *)
339 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
340 PIPE_USAGE_IMMUTABLE, input_size);
341 }
342
343 u_box_1d(0, input_size, &box);
344 num_work_groups_start = ctx->transfer_map(ctx,
345 (struct pipe_resource*)shader->kernel_param,
346 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
347 &box, &transfer);
348 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
349 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
350 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
351
352 /* Copy the work group size */
353 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
354
355 /* Copy the global size */
356 for (i = 0; i < 3; i++) {
357 global_size_start[i] = info->grid[i] * info->block[i];
358 }
359
360 /* Copy the local dimensions */
361 memcpy(local_size_start, info->block, 3 * sizeof(uint));
362
363 /* Copy the kernel inputs */
364 memcpy(kernel_parameters_start, info->input, shader->input_size);
365
366 for (i = 0; i < (input_size / 4); i++) {
367 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
368 ((unsigned*)num_work_groups_start)[i]);
369 }
370
371 ctx->transfer_unmap(ctx, transfer);
372
373 /* ID=0 is reserved for the parameters */
374 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
375 (struct pipe_resource*)shader->kernel_param);
376 }
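/* A minimal sketch (not part of the driver) of the implicit-parameter layout
 * that evergreen_compute_upload_input() writes in front of the kernel
 * arguments; the struct name is hypothetical, the field order follows the
 * DWORD layout documented above. */
#if 0
struct compute_input_header {
uint32_t num_work_groups[3]; /* DWORDS 0-2: grid size in work groups */
uint32_t global_size[3];     /* DWORDS 3-5: grid[i] * block[i] */
uint32_t local_size[3];      /* DWORDS 6-8: work-group size */
/* DWORDS 9+: the kernel parameters follow, copied verbatim */
};
#endif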
377
378 static void evergreen_emit_dispatch(struct r600_context *rctx,
379 const struct pipe_grid_info *info)
380 {
381 int i;
382 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
383 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
384 unsigned num_waves;
385 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
386 unsigned wave_divisor = (16 * num_pipes);
387 int group_size = 1;
388 int grid_size = 1;
389 unsigned lds_size = shader->local_size / 4 +
390 shader->bc.nlds_dw;
391
392
393 /* Calculate group_size/grid_size */
394 for (i = 0; i < 3; i++) {
395 group_size *= info->block[i];
396 }
397
398 for (i = 0; i < 3; i++) {
399 grid_size *= info->grid[i];
400 }
401
402 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
403 num_waves = (info->block[0] * info->block[1] * info->block[2] +
404 wave_divisor - 1) / wave_divisor;
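/* For example (hypothetical numbers): a 16x16x1 block with num_pipes = 8
 * gives wave_divisor = 128, so num_waves = (256 + 127) / 128 = 2. */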
405
406 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
407 "%u wavefronts per thread block, "
408 "allocating %u dwords lds.\n",
409 num_pipes, num_waves, lds_size);
410
411 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
412
413 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
414 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
415 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
416 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
417
418 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
419 group_size);
420
421 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
422 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
423 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
424 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
425
426 if (rctx->b.chip_class < CAYMAN) {
427 assert(lds_size <= 8192);
428 } else {
429 /* Cayman appears to have a slightly smaller limit, see the
430 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
431 assert(lds_size <= 8160);
432 }
433
434 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
435 lds_size | (num_waves << 14));
436
437 /* Dispatch packet */
438 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
439 radeon_emit(cs, info->grid[0]);
440 radeon_emit(cs, info->grid[1]);
441 radeon_emit(cs, info->grid[2]);
442 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
443 radeon_emit(cs, 1);
444 }
445
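/* Build and emit the full command stream for one compute dispatch: the
 * start-of-CS state, the RAT color buffers, vertex/constant/sampler state,
 * the compute shader itself and finally the DISPATCH_DIRECT packet, plus the
 * cache flushes and (on Cayman and later) the CS_PARTIAL_FLUSH and
 * DEALLOC_STATE events. */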
446 static void compute_emit_cs(struct r600_context *rctx,
447 const struct pipe_grid_info *info)
448 {
449 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
450 unsigned i;
451
452 /* make sure that the gfx ring is the only active one */
453 if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) {
454 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
455 }
456
457 /* Initialize all the compute-related registers.
458 *
459 * See evergreen_init_atom_start_compute_cs() in this file for the list
460 * of registers initialized by the start_compute_cs_cmd atom.
461 */
462 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
463
464 /* emit config state */
465 if (rctx->b.chip_class == EVERGREEN)
466 r600_emit_atom(rctx, &rctx->config_state.atom);
467
468 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
469 r600_flush_emit(rctx);
470
471 /* Emit colorbuffers. */
472 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
473 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
474 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
475 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
476 (struct r600_resource*)cb->base.texture,
477 RADEON_USAGE_READWRITE,
478 RADEON_PRIO_SHADER_RW_BUFFER);
479
480 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
481 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
482 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
483 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
484 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
485 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
486 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
487 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
488
489 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
490 radeon_emit(cs, reloc);
491
492 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
493 radeon_emit(cs, reloc);
494 }
495 for (; i < 8 ; i++)
496 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
497 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
498 for (; i < 12; i++)
499 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
500 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
501
502 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
503 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
504 rctx->compute_cb_target_mask);
505
506
507 /* Emit vertex buffer state */
508 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
509 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
510
511 /* Emit constant buffer state */
512 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
513
514 /* Emit sampler state */
515 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
516
517 /* Emit sampler view (texture resource) state */
518 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
519
520 /* Emit compute shader state */
521 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
522
523 /* Emit dispatch state and dispatch packet */
524 evergreen_emit_dispatch(rctx, info);
525
526 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
527 */
528 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
529 R600_CONTEXT_INV_VERTEX_CACHE |
530 R600_CONTEXT_INV_TEX_CACHE;
531 r600_flush_emit(rctx);
532 rctx->b.flags = 0;
533
534 if (rctx->b.chip_class >= CAYMAN) {
535 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
536 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
537 /* DEALLOC_STATE prevents the GPU from hanging when a
538 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
539 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
540 */
541 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
542 radeon_emit(cs, 0);
543 }
544
545 #if 0
546 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
547 for (i = 0; i < cs->cdw; i++) {
548 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
549 }
550 #endif
551
552 }
553
554
555 /**
556 * Emit function for r600_cs_shader_state atom
557 */
558 void evergreen_emit_cs_shader(struct r600_context *rctx,
559 struct r600_atom *atom)
560 {
561 struct r600_cs_shader_state *state =
562 (struct r600_cs_shader_state*)atom;
563 struct r600_pipe_compute *shader = state->shader;
564 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
565 uint64_t va;
566 struct r600_resource *code_bo;
567 unsigned ngpr, nstack;
568
569 code_bo = shader->code_bo;
570 va = shader->code_bo->gpu_address + state->pc;
571 ngpr = shader->bc.ngpr;
572 nstack = shader->bc.nstack;
573
574 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
575 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
576 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
577 S_0288D4_NUM_GPRS(ngpr)
578 | S_0288D4_STACK_SIZE(nstack));
579 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
580
581 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
582 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
583 code_bo, RADEON_USAGE_READ,
584 RADEON_PRIO_USER_SHADER));
585 }
586
587 static void evergreen_launch_grid(struct pipe_context *ctx,
588 const struct pipe_grid_info *info)
589 {
590 struct r600_context *rctx = (struct r600_context *)ctx;
591 #ifdef HAVE_OPENCL
592 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
593 boolean use_kill;
594
595 rctx->cs_shader_state.pc = info->pc;
596 /* Get the config information for this kernel. */
597 r600_shader_binary_read_config(&shader->binary, &shader->bc,
598 info->pc, &use_kill);
599 #endif
600
601 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
602
603
604 evergreen_compute_upload_input(ctx, info);
605 compute_emit_cs(rctx, info);
606 }
607
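/* Bind the compute resources (pipe_surfaces over global-pool buffers).
 * Surface i is exposed to the kernel as vertex buffer i+2 (the first two
 * vertex buffers are reserved for parameters and the global pool), and a
 * writable surface is additionally bound as RAT i+1 (RAT0 is reserved for
 * global bindings). */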
608 static void evergreen_set_compute_resources(struct pipe_context *ctx,
609 unsigned start, unsigned count,
610 struct pipe_surface **surfaces)
611 {
612 struct r600_context *rctx = (struct r600_context *)ctx;
613 struct r600_surface **resources = (struct r600_surface **)surfaces;
614
615 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
616 start, count);
617
618 for (unsigned i = 0; i < count; i++) {
619 /* The first two vertex buffers are reserved for parameters and
620 * global buffers. */
621 unsigned vtx_id = 2 + i;
622 if (resources[i]) {
623 struct r600_resource_global *buffer =
624 (struct r600_resource_global*)
625 resources[i]->base.texture;
626 if (resources[i]->base.writable) {
627 assert(i+1 < 12);
628
629 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
630 (struct r600_resource *)resources[i]->base.texture,
631 buffer->chunk->start_in_dw*4,
632 resources[i]->base.texture->width0);
633 }
634
635 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
636 buffer->chunk->start_in_dw * 4,
637 resources[i]->base.texture);
638 }
639 }
640 }
641
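/* Bind buffers from the compute memory pool. Items that are not yet in the
 * pool are marked for promotion and the pool is finalized; each handle is
 * then rewritten to the buffer's byte offset within the pool (the chunk
 * start in dwords times 4, plus the offset passed in), and the whole pool is
 * bound as RAT0 for writes and as vertex buffer 1 for reads. */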
642 static void evergreen_set_global_binding(struct pipe_context *ctx,
643 unsigned first, unsigned n,
644 struct pipe_resource **resources,
645 uint32_t **handles)
646 {
647 struct r600_context *rctx = (struct r600_context *)ctx;
648 struct compute_memory_pool *pool = rctx->screen->global_pool;
649 struct r600_resource_global **buffers =
650 (struct r600_resource_global **)resources;
651 unsigned i;
652
653 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
654 first, n);
655
656 if (!resources) {
657 /* XXX: Unset */
658 return;
659 }
660
661 /* We mark these items for promotion to the pool if they
662 * aren't already there */
663 for (i = first; i < first + n; i++) {
664 struct compute_memory_item *item = buffers[i]->chunk;
665
666 if (!is_item_in_pool(item))
667 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
668 }
669
670 if (compute_memory_finalize_pending(pool, ctx) == -1) {
671 /* XXX: Unset */
672 return;
673 }
674
675 for (i = first; i < first + n; i++)
676 {
677 uint32_t buffer_offset;
678 uint32_t handle;
679 assert(resources[i]->target == PIPE_BUFFER);
680 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
681
682 buffer_offset = util_le32_to_cpu(*(handles[i]));
683 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
684
685 *(handles[i]) = util_cpu_to_le32(handle);
686 }
687
688 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
689 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
690 (struct pipe_resource*)pool->bo);
691 }
692
693 /**
694 * This function initializes all the compute specific registers that need to
695 * be initialized for each compute command stream. Registers that are common
696 * to both compute and 3D will be initialized at the beginning of each compute
697 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
698 * packet requires that the shader type bit be set, we must initialize all
699 * context registers needed for compute in this function. The registers
700 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
701 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
702 * on the GPU family.
703 */
704 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
705 {
706 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
707 int num_threads;
708 int num_stack_entries;
709
710 /* since all required registers are initialized in the
711 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
712 */
713 r600_init_command_buffer(cb, 256);
714 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
715
716 /* This must be first. */
717 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
718 r600_store_value(cb, 0x80000000);
719 r600_store_value(cb, 0x80000000);
720
721 /* We're setting config registers here. */
722 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
723 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
724
725 switch (rctx->b.family) {
726 case CHIP_CEDAR:
727 default:
728 num_threads = 128;
729 num_stack_entries = 256;
730 break;
731 case CHIP_REDWOOD:
732 num_threads = 128;
733 num_stack_entries = 256;
734 break;
735 case CHIP_JUNIPER:
736 num_threads = 128;
737 num_stack_entries = 512;
738 break;
739 case CHIP_CYPRESS:
740 case CHIP_HEMLOCK:
741 num_threads = 128;
742 num_stack_entries = 512;
743 break;
744 case CHIP_PALM:
745 num_threads = 128;
746 num_stack_entries = 256;
747 break;
748 case CHIP_SUMO:
749 num_threads = 128;
750 num_stack_entries = 256;
751 break;
752 case CHIP_SUMO2:
753 num_threads = 128;
754 num_stack_entries = 512;
755 break;
756 case CHIP_BARTS:
757 num_threads = 128;
758 num_stack_entries = 512;
759 break;
760 case CHIP_TURKS:
761 num_threads = 128;
762 num_stack_entries = 256;
763 break;
764 case CHIP_CAICOS:
765 num_threads = 128;
766 num_stack_entries = 256;
767 break;
768 }
769
770 /* Config Registers */
771 if (rctx->b.chip_class < CAYMAN)
772 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
773 rctx->screen->b.info.drm_minor);
774 else
775 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
776 rctx->screen->b.info.drm_minor);
777
778 /* The primitive type always needs to be POINTLIST for compute. */
779 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
780 V_008958_DI_PT_POINTLIST);
781
782 if (rctx->b.chip_class < CAYMAN) {
783
784 /* These registers control which simds can be used by each stage.
785 * The default for these registers is 0xffffffff, which means
786 * all simds are available for each stage. It's possible we may
787 * want to play around with these in the future, but for now
788 * the default value is fine.
789 *
790 * R_008E20_SQ_STATIC_THREAD_MGMT1
791 * R_008E24_SQ_STATIC_THREAD_MGMT2
792 * R_008E28_SQ_STATIC_THREAD_MGMT3
793 */
794
795 /* XXX: We may need to adjust the thread and stack resource
796 * values for 3D/compute interop */
797
798 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
799
800 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
801 * Set the number of threads used by the PS/VS/GS/ES stage to
802 * 0.
803 */
804 r600_store_value(cb, 0);
805
806 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
807 * Set the number of threads used by the CS (aka LS) stage to
808 * the maximum number of threads and set the number of threads
809 * for the HS stage to 0. */
810 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
811
812 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
813 * Set the Control Flow stack entries to 0 for PS/VS stages */
814 r600_store_value(cb, 0);
815
816 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
817 * Set the Control Flow stack entries to 0 for GS/ES stages */
818 r600_store_value(cb, 0);
819
820 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
821 * Set the Control Flow stack entries to 0 for the HS stage, and
822 * set it to the maximum value for the CS (aka LS) stage. */
823 r600_store_value(cb,
824 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
825 }
826 /* Give the compute shader all the available LDS space.
827 * NOTE: This only sets the maximum number of dwords that a compute
828 * shader can allocate. When a shader is executed, we still need to
829 * allocate the appropriate amount of LDS dwords using the
830 * CM_R_0288E8_SQ_LDS_ALLOC register.
831 */
832 if (rctx->b.chip_class < CAYMAN) {
833 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
834 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
835 } else {
836 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
837 S_0286FC_NUM_PS_LDS(0) |
838 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
839 }
840
841 /* Context Registers */
842
843 if (rctx->b.chip_class < CAYMAN) {
844 /* workaround for hw issues with dyn gpr - must set all limits
845 * to 240 instead of 0, 0x1e == 240 / 8
846 */
847 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
848 S_028838_PS_GPRS(0x1e) |
849 S_028838_VS_GPRS(0x1e) |
850 S_028838_GS_GPRS(0x1e) |
851 S_028838_ES_GPRS(0x1e) |
852 S_028838_HS_GPRS(0x1e) |
853 S_028838_LS_GPRS(0x1e));
854 }
855
856 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
857 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
858 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
859
860 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
861
862 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
863 S_0286E8_TID_IN_GROUP_ENA
864 | S_0286E8_TGID_ENA
865 | S_0286E8_DISABLE_INDEX_PACK);
866
867
868 /* The LOOP_CONST registers are an optimization for loops that allows
869 * you to store the initial counter, increment value, and maximum
870 * counter value in a register so that the hardware can calculate the
871 * correct number of iterations for the loop and you don't need
872 * to keep the loop counter in your shader code. We don't currently use
873 * this optimization, so we must keep track of the counter in the
874 * shader and use a break instruction to exit loops. However, the
875 * hardware still uses this register to determine when to exit a
876 * loop, so we need to initialize the counter to 0, set the increment
877 * value to 1 and the maximum counter value to 4095 (0xfff), which
878 * is the maximum value allowed. This gives us a maximum of 4096
879 * iterations for our loops, but hopefully our break instruction will
880 * execute some time before the 4096th iteration.
881 */
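/* The value written below, 0x1000FFF, presumably packs those three fields as
 * increment = 1 (bits 31:24), initial value = 0 (bits 23:12) and maximum
 * count = 0xFFF (bits 11:0); the exact bit positions are an assumption, not
 * something stated in this file. */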
882 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
883 }
884
885 void evergreen_init_compute_state_functions(struct r600_context *rctx)
886 {
887 rctx->b.b.create_compute_state = evergreen_create_compute_state;
888 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
889 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
890 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
891 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
892 rctx->b.b.set_global_binding = evergreen_set_global_binding;
893 rctx->b.b.launch_grid = evergreen_launch_grid;
894
895 }
896
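/* Map a global resource for CPU access. The mapping is done on the item's
 * real VRAM buffer: an item that currently lives in the compute memory pool
 * is demoted to its own buffer first (allocating one if it never existed),
 * which is why the matching unmap never reaches
 * r600_compute_global_transfer_unmap() below. */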
897 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
898 struct pipe_resource *resource,
899 unsigned level,
900 unsigned usage,
901 const struct pipe_box *box,
902 struct pipe_transfer **ptransfer)
903 {
904 struct r600_context *rctx = (struct r600_context*)ctx;
905 struct compute_memory_pool *pool = rctx->screen->global_pool;
906 struct r600_resource_global* buffer =
907 (struct r600_resource_global*)resource;
908
909 struct compute_memory_item *item = buffer->chunk;
910 struct pipe_resource *dst = NULL;
911 unsigned offset = box->x;
912
913 if (is_item_in_pool(item)) {
914 compute_memory_demote_item(pool, item, ctx);
915 }
916 else {
917 if (item->real_buffer == NULL) {
918 item->real_buffer =
919 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
920 }
921 }
922
923 dst = (struct pipe_resource*)item->real_buffer;
924
925 if (usage & PIPE_TRANSFER_READ)
926 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
927
928 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
929 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
930 "width = %u, height = %u, depth = %u)\n", level, usage,
931 box->x, box->y, box->z, box->width, box->height,
932 box->depth);
933 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
934 "%u (box.x)\n", item->id, box->x);
935
936
937 assert(resource->target == PIPE_BUFFER);
938 assert(resource->bind & PIPE_BIND_GLOBAL);
939 assert(box->x >= 0);
940 assert(box->y == 0);
941 assert(box->z == 0);
942
943 ///TODO: do it better, mapping is not possible if the pool is too big
944 return pipe_buffer_map_range(ctx, dst,
945 offset, box->width, usage, ptransfer);
946 }
947
948 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
949 struct pipe_transfer *transfer)
950 {
951 /* struct r600_resource_global are not real resources, they just map
952 * to an offset within the compute memory pool. The function
953 * r600_compute_global_transfer_map() maps the memory pool
954 * resource rather than the struct r600_resource_global passed to
955 * it as an argument and then initializes ptransfer->resource with
956 * the memory pool resource (via pipe_buffer_map_range).
957 * When transfer_unmap is called it uses the memory pool's
958 * vtable which calls r600_buffer_transfer_unmap() rather than
959 * this function.
960 */
961 assert (!"This function should not be called");
962 }
963
964 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
965 struct pipe_transfer *transfer,
966 const struct pipe_box *box)
967 {
968 assert(0 && "TODO");
969 }
970
971 static void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
972 struct pipe_resource *resource,
973 unsigned level,
974 unsigned usage,
975 const struct pipe_box *box,
976 const void *data,
977 unsigned stride,
978 unsigned layer_stride)
979 {
980 assert(0 && "TODO");
981 }
982
983 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
984 struct pipe_resource *res)
985 {
986 struct r600_resource_global* buffer = NULL;
987 struct r600_screen* rscreen = NULL;
988
989 assert(res->target == PIPE_BUFFER);
990 assert(res->bind & PIPE_BIND_GLOBAL);
991
992 buffer = (struct r600_resource_global*)res;
993 rscreen = (struct r600_screen*)screen;
994
995 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
996
997 buffer->chunk = NULL;
998 free(res);
999 }
1000
1001 static const struct u_resource_vtbl r600_global_buffer_vtbl =
1002 {
1003 u_default_resource_get_handle, /* get_handle */
1004 r600_compute_global_buffer_destroy, /* resource_destroy */
1005 r600_compute_global_transfer_map, /* transfer_map */
1006 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1007 r600_compute_global_transfer_unmap, /* transfer_unmap */
1008 r600_compute_global_transfer_inline_write /* transfer_inline_write */
1009 };
1010
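/* Create a "global" buffer resource. It has no buffer object of its own; its
 * storage is a chunk allocated (in dwords) from the screen's compute memory
 * pool, so only the chunk bookkeeping is set up here. */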
1011 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1012 const struct pipe_resource *templ)
1013 {
1014 struct r600_resource_global* result = NULL;
1015 struct r600_screen* rscreen = NULL;
1016 int size_in_dw = 0;
1017
1018 assert(templ->target == PIPE_BUFFER);
1019 assert(templ->bind & PIPE_BIND_GLOBAL);
1020 assert(templ->array_size == 1 || templ->array_size == 0);
1021 assert(templ->depth0 == 1 || templ->depth0 == 0);
1022 assert(templ->height0 == 1 || templ->height0 == 0);
1023
1024 result = (struct r600_resource_global*)
1025 CALLOC(sizeof(struct r600_resource_global), 1);
1026 rscreen = (struct r600_screen*)screen;
1027
1028 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1029 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1030 templ->array_size);
1031
1032 result->base.b.vtbl = &r600_global_buffer_vtbl;
1033 result->base.b.b = *templ;
1034 result->base.b.b.screen = screen;
1035 pipe_reference_init(&result->base.b.b.reference, 1);
1036
1037 size_in_dw = (templ->width0+3) / 4;
1038
1039 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1040
1041 if (result->chunk == NULL)
1042 {
1043 free(result);
1044 return NULL;
1045 }
1046
1047 return &result->base.b.b;
1048 }