r600: make two compute functions static.
[mesa.git] / src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 is for binding a smaller input parameter buffer and for constant indexing;
67 it is also cached in the constant cache
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can only bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too. => 10 image bindings for writing max.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
82 writable images will consume TEX slots, and VTX slots too, because of linear indexing
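e.g. a writable image with id 0 is bound as RAT1 and read back through TEX2, id 1 as RAT2/TEX3, and so on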
83
84 */
85
86 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
87 unsigned size)
88 {
89 struct pipe_resource *buffer = NULL;
90 assert(size);
91
92 buffer = pipe_buffer_create((struct pipe_screen*) screen,
93 PIPE_BIND_CUSTOM,
94 PIPE_USAGE_IMMUTABLE,
95 size);
96
97 return (struct r600_resource *)buffer;
98 }
99
100
101 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
102 unsigned id,
103 struct r600_resource *bo,
104 int start,
105 int size)
106 {
107 struct pipe_surface rat_templ;
108 struct r600_surface *surf = NULL;
109 struct r600_context *rctx = NULL;
110
111 assert(id < 12);
112 assert((size & 3) == 0);
113 assert((start & 0xFF) == 0);
114
115 rctx = pipe->ctx;
116
117 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
118
119 /* Create the RAT surface */
120 memset(&rat_templ, 0, sizeof(rat_templ));
121 rat_templ.format = PIPE_FORMAT_R32_UINT;
122 rat_templ.u.tex.level = 0;
123 rat_templ.u.tex.first_layer = 0;
124 rat_templ.u.tex.last_layer = 0;
125
126 /* Add the RAT to the list of color buffers */
127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128 (struct pipe_context *)pipe->ctx,
129 (struct pipe_resource *)bo, &rat_templ);
130
131 /* Update the number of color buffers */
132 pipe->ctx->framebuffer.state.nr_cbufs =
133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134
135 /* Update the cb_target_mask
136 * XXX: I think this is a potential spot for bugs once we start doing
137 * GL interop. cb_target_mask may be modified in the 3D sections
138 * of this driver. */
139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
140
141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142 evergreen_init_color_surface_rat(rctx, surf);
143 }
144
145 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146 unsigned vb_index,
147 unsigned offset,
148 struct pipe_resource *buffer)
149 {
150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
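/* A stride of 1 lets the vertex fetch address the buffer byte by byte
 * (the offsets passed in here are byte offsets). */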
152 vb->stride = 1;
153 vb->buffer_offset = offset;
154 vb->buffer = buffer;
155 vb->user_buffer = NULL;
156
157 /* The vertex instructions in the compute shaders use the texture cache,
158 * so we need to invalidate it. */
159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160 state->enabled_mask |= 1 << vb_index;
161 state->dirty_mask |= 1 << vb_index;
162 r600_mark_atom_dirty(rctx, &state->atom);
163 }
164
165 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166 unsigned cb_index,
167 unsigned offset,
168 unsigned size,
169 struct pipe_resource *buffer)
170 {
171 struct pipe_constant_buffer cb;
172 cb.buffer_size = size;
173 cb.buffer_offset = offset;
174 cb.buffer = buffer;
175 cb.user_buffer = NULL;
176
177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178 }
179
180 static const struct u_resource_vtbl r600_global_buffer_vtbl =
181 {
182 u_default_resource_get_handle, /* get_handle */
183 r600_compute_global_buffer_destroy, /* resource_destroy */
184 r600_compute_global_transfer_map, /* transfer_map */
185 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
186 r600_compute_global_transfer_unmap, /* transfer_unmap */
187 r600_compute_global_transfer_inline_write /* transfer_inline_write */
188 };
189
190 /* We need to define these R600 registers here, because we can't include
191 * both evergreend.h and r600d.h.
192 */
193 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
194 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
195
196 #ifdef HAVE_OPENCL
197
198 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
199 struct r600_bytecode *bc,
200 uint64_t symbol_offset,
201 boolean *use_kill)
202 {
203 unsigned i;
204 const unsigned char *config =
205 radeon_shader_binary_config_start(binary, symbol_offset);
206
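/* The config data is a list of (register, value) dword pairs, 8 bytes per entry. */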
207 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
208 unsigned reg =
209 util_le32_to_cpu(*(uint32_t*)(config + i));
210 unsigned value =
211 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
212 switch (reg) {
213 /* R600 / R700 */
214 case R_028850_SQ_PGM_RESOURCES_PS:
215 case R_028868_SQ_PGM_RESOURCES_VS:
216 /* Evergreen / Northern Islands */
217 case R_028844_SQ_PGM_RESOURCES_PS:
218 case R_028860_SQ_PGM_RESOURCES_VS:
219 case R_0288D4_SQ_PGM_RESOURCES_LS:
220 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
221 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
222 break;
223 case R_02880C_DB_SHADER_CONTROL:
224 *use_kill = G_02880C_KILL_ENABLE(value);
225 break;
226 case R_0288E8_SQ_LDS_ALLOC:
227 bc->nlds_dw = value;
228 break;
229 }
230 }
231 }
232
233 static unsigned r600_create_shader(struct r600_bytecode *bc,
234 const struct radeon_shader_binary *binary,
235 boolean *use_kill)
236
237 {
238 assert(binary->code_size % 4 == 0);
239 bc->bytecode = CALLOC(1, binary->code_size);
240 memcpy(bc->bytecode, binary->code, binary->code_size);
241 bc->ndw = binary->code_size / 4;
242
243 r600_shader_binary_read_config(binary, bc, 0, use_kill);
244 return 0;
245 }
246
247 #endif
248
249 static void r600_destroy_shader(struct r600_bytecode *bc)
250 {
251 FREE(bc->bytecode);
252 }
253
254 static void *evergreen_create_compute_state(struct pipe_context *ctx,
255 const struct pipe_compute_state *cso)
256 {
257 struct r600_context *rctx = (struct r600_context *)ctx;
258 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
259 #ifdef HAVE_OPENCL
260 const struct pipe_llvm_program_header *header;
261 const char *code;
262 void *p;
263 boolean use_kill;
264
265 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
266 header = cso->prog;
267 code = cso->prog + sizeof(struct pipe_llvm_program_header);
268 radeon_shader_binary_init(&shader->binary);
269 radeon_elf_read(code, header->num_bytes, &shader->binary);
270 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
271
272 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
273 shader->bc.ndw * 4);
274 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
275 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
276 rctx->b.ws->buffer_unmap(shader->code_bo->buf);
277 #endif
278
279 shader->ctx = rctx;
280 shader->local_size = cso->req_local_mem;
281 shader->private_size = cso->req_private_mem;
282 shader->input_size = cso->req_input_mem;
283
284 return shader;
285 }
286
287 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
288 {
289 struct r600_context *rctx = (struct r600_context *)ctx;
290 struct r600_pipe_compute *shader = state;
291
292 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
293
294 if (!shader)
295 return;
296
297 radeon_shader_binary_clean(&shader->binary);
298 r600_destroy_shader(&shader->bc);
299
300 /* TODO destroy shader->code_bo, shader->const_bo
301 * we'll need something like r600_buffer_free */
302 FREE(shader);
303 }
304
305 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
306 {
307 struct r600_context *rctx = (struct r600_context *)ctx;
308
309 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
310
311 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
312 }
313
314 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
315 * kernel parameters, there are implicit parameters that need to be stored
316 * in the vertex buffer as well. Here is how these parameters are organized in
317 * the buffer:
318 *
319 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
320 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
321 * DWORDS 6-8: Number of work items within each work group in each dimension
322 * (x,y,z)
323 * DWORDS 9+ : Kernel parameters
324 */
325 static void evergreen_compute_upload_input(struct pipe_context *ctx,
326 const struct pipe_grid_info *info)
327 {
328 struct r600_context *rctx = (struct r600_context *)ctx;
329 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
330 unsigned i;
331 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
332 * parameters.
333 */
334 unsigned input_size = shader->input_size + 36;
335 uint32_t *num_work_groups_start;
336 uint32_t *global_size_start;
337 uint32_t *local_size_start;
338 uint32_t *kernel_parameters_start;
339 struct pipe_box box;
340 struct pipe_transfer *transfer = NULL;
341
342 if (shader->input_size == 0) {
343 return;
344 }
345
346 if (!shader->kernel_param) {
347 /* Add space for the grid dimensions */
348 shader->kernel_param = (struct r600_resource *)
349 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
350 PIPE_USAGE_IMMUTABLE, input_size);
351 }
352
353 u_box_1d(0, input_size, &box);
354 num_work_groups_start = ctx->transfer_map(ctx,
355 (struct pipe_resource*)shader->kernel_param,
356 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
357 &box, &transfer);
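/* The mapped pointer is a uint32_t*, so adding 3 advances past one group of
 * three dwords (grid size, then global size, then local size). */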
358 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
359 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
360 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
361
362 /* Copy the grid size (the number of work groups) */
363 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
364
365 /* Copy the global size */
366 for (i = 0; i < 3; i++) {
367 global_size_start[i] = info->grid[i] * info->block[i];
368 }
369
370 /* Copy the local dimensions */
371 memcpy(local_size_start, info->block, 3 * sizeof(uint));
372
373 /* Copy the kernel inputs */
374 memcpy(kernel_parameters_start, info->input, shader->input_size);
375
376 for (i = 0; i < (input_size / 4); i++) {
377 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
378 ((unsigned*)num_work_groups_start)[i]);
379 }
380
381 ctx->transfer_unmap(ctx, transfer);
382
383 /* ID=0 is reserved for the parameters */
384 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
385 (struct pipe_resource*)shader->kernel_param);
386 }
387
388 static void evergreen_emit_dispatch(struct r600_context *rctx,
389 const struct pipe_grid_info *info)
390 {
391 int i;
392 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
393 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
394 unsigned num_waves;
395 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
396 unsigned wave_divisor = (16 * num_pipes);
397 int group_size = 1;
398 int grid_size = 1;
399 unsigned lds_size = shader->local_size / 4 +
400 shader->bc.nlds_dw;
401
402
403 /* Calculate group_size/grid_size */
404 for (i = 0; i < 3; i++) {
405 group_size *= info->block[i];
406 }
407
408 for (i = 0; i < 3; i++) {
409 grid_size *= info->grid[i];
410 }
411
412 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
413 num_waves = (info->block[0] * info->block[1] * info->block[2] +
414 wave_divisor - 1) / wave_divisor;
415
416 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
417 "%u wavefronts per thread block, "
418 "allocating %u dwords lds.\n",
419 num_pipes, num_waves, lds_size);
420
421 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
422
423 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
424 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
425 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
426 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
427
428 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
429 group_size);
430
431 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
432 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
433 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
434 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
435
436 if (rctx->b.chip_class < CAYMAN) {
437 assert(lds_size <= 8192);
438 } else {
439 /* Cayman appears to have a slightly smaller limit, see the
440 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
441 assert(lds_size <= 8160);
442 }
443
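/* SQ_LDS_ALLOC packs the LDS size in dwords together with the wave count
 * (shifted to bit 14). */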
444 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
445 lds_size | (num_waves << 14));
446
447 /* Dispatch packet */
448 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
449 radeon_emit(cs, info->grid[0]);
450 radeon_emit(cs, info->grid[1]);
451 radeon_emit(cs, info->grid[2]);
452 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
453 radeon_emit(cs, 1);
454 }
455
456 static void compute_emit_cs(struct r600_context *rctx,
457 const struct pipe_grid_info *info)
458 {
459 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
460 unsigned i;
461
462 /* make sure that the gfx ring is the only one active */
463 if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) {
464 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
465 }
466
467 /* Initialize all the compute-related registers.
468 *
469 * See evergreen_init_atom_start_compute_cs() in this file for the list
470 * of registers initialized by the start_compute_cs_cmd atom.
471 */
472 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
473
474 /* emit config state */
475 if (rctx->b.chip_class == EVERGREEN)
476 r600_emit_atom(rctx, &rctx->config_state.atom);
477
478 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
479 r600_flush_emit(rctx);
480
481 /* Emit colorbuffers. */
482 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
483 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
484 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
485 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
486 (struct r600_resource*)cb->base.texture,
487 RADEON_USAGE_READWRITE,
488 RADEON_PRIO_SHADER_RW_BUFFER);
489
490 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
491 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
492 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
493 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
494 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
495 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
496 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
497 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
498
499 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
500 radeon_emit(cs, reloc);
501
502 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
503 radeon_emit(cs, reloc);
504 }
505 for (; i < 8 ; i++)
506 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
507 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
508 for (; i < 12; i++)
509 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
510 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
511
512 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
513 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
514 rctx->compute_cb_target_mask);
515
516
517 /* Emit vertex buffer state */
518 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
519 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
520
521 /* Emit constant buffer state */
522 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
523
524 /* Emit sampler state */
525 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
526
527 /* Emit sampler view (texture resource) state */
528 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
529
530 /* Emit compute shader state */
531 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
532
533 /* Emit dispatch state and dispatch packet */
534 evergreen_emit_dispatch(rctx, info);
535
536 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
537 */
538 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
539 R600_CONTEXT_INV_VERTEX_CACHE |
540 R600_CONTEXT_INV_TEX_CACHE;
541 r600_flush_emit(rctx);
542 rctx->b.flags = 0;
543
544 if (rctx->b.chip_class >= CAYMAN) {
545 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
546 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
547 /* DEALLOC_STATE prevents the GPU from hanging when a
548 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
549 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
550 */
551 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
552 cs->buf[cs->cdw++] = 0;
553 }
554
555 #if 0
556 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
557 for (i = 0; i < cs->cdw; i++) {
558 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
559 }
560 #endif
561
562 }
563
564
565 /**
566 * Emit function for r600_cs_shader_state atom
567 */
568 void evergreen_emit_cs_shader(struct r600_context *rctx,
569 struct r600_atom *atom)
570 {
571 struct r600_cs_shader_state *state =
572 (struct r600_cs_shader_state*)atom;
573 struct r600_pipe_compute *shader = state->shader;
574 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
575 uint64_t va;
576 struct r600_resource *code_bo;
577 unsigned ngpr, nstack;
578
579 code_bo = shader->code_bo;
580 va = shader->code_bo->gpu_address + state->pc;
581 ngpr = shader->bc.ngpr;
582 nstack = shader->bc.nstack;
583
584 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
585 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
586 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
587 S_0288D4_NUM_GPRS(ngpr)
588 | S_0288D4_STACK_SIZE(nstack));
589 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
590
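/* The NOP packet carries the relocation for the shader code BO referenced
 * by SQ_PGM_START_LS above. */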
591 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
592 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
593 code_bo, RADEON_USAGE_READ,
594 RADEON_PRIO_USER_SHADER));
595 }
596
597 static void evergreen_launch_grid(struct pipe_context *ctx,
598 const struct pipe_grid_info *info)
599 {
600 struct r600_context *rctx = (struct r600_context *)ctx;
601 #ifdef HAVE_OPENCL
602 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
603 boolean use_kill;
604
605 rctx->cs_shader_state.pc = info->pc;
606 /* Get the config information for this kernel. */
607 r600_shader_binary_read_config(&shader->binary, &shader->bc,
608 info->pc, &use_kill);
609 #endif
610
611 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
612
613
614 evergreen_compute_upload_input(ctx, info);
615 compute_emit_cs(rctx, info);
616 }
617
618 static void evergreen_set_compute_resources(struct pipe_context *ctx,
619 unsigned start, unsigned count,
620 struct pipe_surface **surfaces)
621 {
622 struct r600_context *rctx = (struct r600_context *)ctx;
623 struct r600_surface **resources = (struct r600_surface **)surfaces;
624
625 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
626 start, count);
627
628 for (unsigned i = 0; i < count; i++) {
629 /* The first two vertex buffers are reserved for parameters and
630 * global buffers. */
631 unsigned vtx_id = 2 + i;
632 if (resources[i]) {
633 struct r600_resource_global *buffer =
634 (struct r600_resource_global*)
635 resources[i]->base.texture;
636 if (resources[i]->base.writable) {
637 assert(i+1 < 12);
638
639 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
640 (struct r600_resource *)resources[i]->base.texture,
641 buffer->chunk->start_in_dw*4,
642 resources[i]->base.texture->width0);
643 }
644
645 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
646 buffer->chunk->start_in_dw * 4,
647 resources[i]->base.texture);
648 }
649 }
650 }
651
652 static void evergreen_set_global_binding(struct pipe_context *ctx,
653 unsigned first, unsigned n,
654 struct pipe_resource **resources,
655 uint32_t **handles)
656 {
657 struct r600_context *rctx = (struct r600_context *)ctx;
658 struct compute_memory_pool *pool = rctx->screen->global_pool;
659 struct r600_resource_global **buffers =
660 (struct r600_resource_global **)resources;
661 unsigned i;
662
663 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
664 first, n);
665
666 if (!resources) {
667 /* XXX: Unset */
668 return;
669 }
670
671 /* We mark these items for promotion to the pool if they
672 * aren't already there */
673 for (i = first; i < first + n; i++) {
674 struct compute_memory_item *item = buffers[i]->chunk;
675
676 if (!is_item_in_pool(item))
677 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
678 }
679
680 if (compute_memory_finalize_pending(pool, ctx) == -1) {
681 /* XXX: Unset */
682 return;
683 }
684
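/* Patch each handle so that it holds the buffer's absolute byte offset
 * within the pool. */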
685 for (i = first; i < first + n; i++)
686 {
687 uint32_t buffer_offset;
688 uint32_t handle;
689 assert(resources[i]->target == PIPE_BUFFER);
690 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
691
692 buffer_offset = util_le32_to_cpu(*(handles[i]));
693 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
694
695 *(handles[i]) = util_cpu_to_le32(handle);
696 }
697
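/* RAT0 and VTX1 are reserved for the global memory pool
 * (see the comment at the top of this file). */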
698 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
699 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
700 (struct pipe_resource*)pool->bo);
701 }
702
703 /**
704 * This function initializes all the compute specific registers that need to
705 * be initialized for each compute command stream. Registers that are common
706 * to both compute and 3D will be initialized at the beginning of each compute
707 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
708 * packet requires that the shader type bit be set, we must initialize all
709 * context registers needed for compute in this function. The registers
710 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
711 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
712 * on the GPU family.
713 */
714 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
715 {
716 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
717 int num_threads;
718 int num_stack_entries;
719
720 /* since all required registers are initialized in the
721 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
722 */
723 r600_init_command_buffer(cb, 256);
724 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
725
726 /* This must be first. */
727 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
728 r600_store_value(cb, 0x80000000);
729 r600_store_value(cb, 0x80000000);
730
731 /* We're setting config registers here. */
732 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
733 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
734
735 switch (rctx->b.family) {
736 case CHIP_CEDAR:
737 default:
738 num_threads = 128;
739 num_stack_entries = 256;
740 break;
741 case CHIP_REDWOOD:
742 num_threads = 128;
743 num_stack_entries = 256;
744 break;
745 case CHIP_JUNIPER:
746 num_threads = 128;
747 num_stack_entries = 512;
748 break;
749 case CHIP_CYPRESS:
750 case CHIP_HEMLOCK:
751 num_threads = 128;
752 num_stack_entries = 512;
753 break;
754 case CHIP_PALM:
755 num_threads = 128;
756 num_stack_entries = 256;
757 break;
758 case CHIP_SUMO:
759 num_threads = 128;
760 num_stack_entries = 256;
761 break;
762 case CHIP_SUMO2:
763 num_threads = 128;
764 num_stack_entries = 512;
765 break;
766 case CHIP_BARTS:
767 num_threads = 128;
768 num_stack_entries = 512;
769 break;
770 case CHIP_TURKS:
771 num_threads = 128;
772 num_stack_entries = 256;
773 break;
774 case CHIP_CAICOS:
775 num_threads = 128;
776 num_stack_entries = 256;
777 break;
778 }
779
780 /* Config Registers */
781 if (rctx->b.chip_class < CAYMAN)
782 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
783 rctx->screen->b.info.drm_minor);
784 else
785 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
786 rctx->screen->b.info.drm_minor);
787
788 /* The primitive type always needs to be POINTLIST for compute. */
789 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
790 V_008958_DI_PT_POINTLIST);
791
792 if (rctx->b.chip_class < CAYMAN) {
793
794 /* These registers control which simds can be used by each stage.
795 * The default for these registers is 0xffffffff, which means
796 * all simds are available for each stage. It's possible we may
797 * want to play around with these in the future, but for now
798 * the default value is fine.
799 *
800 * R_008E20_SQ_STATIC_THREAD_MGMT1
801 * R_008E24_SQ_STATIC_THREAD_MGMT2
802 * R_008E28_SQ_STATIC_THREAD_MGMT3
803 */
804
805 /* XXX: We may need to adjust the thread and stack resource
806 * values for 3D/compute interop */
807
808 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
809
810 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
811 * Set the number of threads used by the PS/VS/GS/ES stage to
812 * 0.
813 */
814 r600_store_value(cb, 0);
815
816 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
817 * Set the number of threads used by the CS (aka LS) stage to
818 * the maximum number of threads and set the number of threads
819 * for the HS stage to 0. */
820 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
821
822 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
823 * Set the Control Flow stack entries to 0 for PS/VS stages */
824 r600_store_value(cb, 0);
825
826 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
827 * Set the Control Flow stack entries to 0 for GS/ES stages */
828 r600_store_value(cb, 0);
829
830 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
831 * Set the Control Flow stack entries to 0 for the HS stage, and
832 * set it to the maximum value for the CS (aka LS) stage. */
833 r600_store_value(cb,
834 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
835 }
836 /* Give the compute shader all the available LDS space.
837 * NOTE: This only sets the maximum number of dwords that a compute
838 * shader can allocate. When a shader is executed, we still need to
839 * allocate the appropriate amount of LDS dwords using the
840 * CM_R_0288E8_SQ_LDS_ALLOC register.
841 */
842 if (rctx->b.chip_class < CAYMAN) {
843 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
844 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
845 } else {
846 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
847 S_0286FC_NUM_PS_LDS(0) |
848 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
849 }
850
851 /* Context Registers */
852
853 if (rctx->b.chip_class < CAYMAN) {
854 /* workaround for hw issues with dyn gpr - must set all limits
855 * to 240 instead of 0, 0x1e == 240 / 8
856 */
857 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
858 S_028838_PS_GPRS(0x1e) |
859 S_028838_VS_GPRS(0x1e) |
860 S_028838_GS_GPRS(0x1e) |
861 S_028838_ES_GPRS(0x1e) |
862 S_028838_HS_GPRS(0x1e) |
863 S_028838_LS_GPRS(0x1e));
864 }
865
866 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
867 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
868 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
869
870 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
871
872 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
873 S_0286E8_TID_IN_GROUP_ENA
874 | S_0286E8_TGID_ENA
875 | S_0286E8_DISABLE_INDEX_PACK)
876 ;
877
878 /* The LOOP_CONST registers are an optimization for loops that allows
879 * you to store the initial counter, increment value, and maximum
880 * counter value in a register so that hardware can calculate the
881 * correct number of iterations for the loop, so that you don't need
882 * to have the loop counter in your shader code. We don't currently use
883 * this optimization, so we must keep track of the counter in the
884 * shader and use a break instruction to exit loops. However, the
885 * hardware will still use this register to determine when to exit a
886 * loop, so we need to initialize the counter to 0, set the increment
887 * value to 1 and the maximum counter value to 4095 (0xfff), which
888 * is the maximum value allowed. This gives us a maximum of 4096
889 * iterations for our loops, but hopefully our break instruction will
890 * execute some time before the 4096th iteration.
891 */
892 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
893 }
894
895 void evergreen_init_compute_state_functions(struct r600_context *rctx)
896 {
897 rctx->b.b.create_compute_state = evergreen_create_compute_state;
898 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
899 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
900 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
901 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
902 rctx->b.b.set_global_binding = evergreen_set_global_binding;
903 rctx->b.b.launch_grid = evergreen_launch_grid;
904
905 }
906
907 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
908 const struct pipe_resource *templ)
909 {
910 struct r600_resource_global* result = NULL;
911 struct r600_screen* rscreen = NULL;
912 int size_in_dw = 0;
913
914 assert(templ->target == PIPE_BUFFER);
915 assert(templ->bind & PIPE_BIND_GLOBAL);
916 assert(templ->array_size == 1 || templ->array_size == 0);
917 assert(templ->depth0 == 1 || templ->depth0 == 0);
918 assert(templ->height0 == 1 || templ->height0 == 0);
919
920 result = (struct r600_resource_global*)
921 CALLOC(sizeof(struct r600_resource_global), 1);
922 rscreen = (struct r600_screen*)screen;
923
924 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
925 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
926 templ->array_size);
927
928 result->base.b.vtbl = &r600_global_buffer_vtbl;
929 result->base.b.b = *templ;
930 result->base.b.b.screen = screen;
931 pipe_reference_init(&result->base.b.b.reference, 1);
932
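/* Round the size in bytes up to a whole number of dwords. */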
933 size_in_dw = (templ->width0+3) / 4;
934
935 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
936
937 if (result->chunk == NULL)
938 {
939 free(result);
940 return NULL;
941 }
942
943 return &result->base.b.b;
944 }
945
946 void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
947 struct pipe_resource *res)
948 {
949 struct r600_resource_global* buffer = NULL;
950 struct r600_screen* rscreen = NULL;
951
952 assert(res->target == PIPE_BUFFER);
953 assert(res->bind & PIPE_BIND_GLOBAL);
954
955 buffer = (struct r600_resource_global*)res;
956 rscreen = (struct r600_screen*)screen;
957
958 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
959
960 buffer->chunk = NULL;
961 free(res);
962 }
963
964 void *r600_compute_global_transfer_map(struct pipe_context *ctx,
965 struct pipe_resource *resource,
966 unsigned level,
967 unsigned usage,
968 const struct pipe_box *box,
969 struct pipe_transfer **ptransfer)
970 {
971 struct r600_context *rctx = (struct r600_context*)ctx;
972 struct compute_memory_pool *pool = rctx->screen->global_pool;
973 struct r600_resource_global* buffer =
974 (struct r600_resource_global*)resource;
975
976 struct compute_memory_item *item = buffer->chunk;
977 struct pipe_resource *dst = NULL;
978 unsigned offset = box->x;
979
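/* Map the item's own buffer: demote it out of the pool if it lives there,
 * or allocate a standalone buffer if it does not have one yet. */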
980 if (is_item_in_pool(item)) {
981 compute_memory_demote_item(pool, item, ctx);
982 }
983 else {
984 if (item->real_buffer == NULL) {
985 item->real_buffer =
986 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
987 }
988 }
989
990 dst = (struct pipe_resource*)item->real_buffer;
991
992 if (usage & PIPE_TRANSFER_READ)
993 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
994
995 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
996 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
997 "width = %u, height = %u, depth = %u)\n", level, usage,
998 box->x, box->y, box->z, box->width, box->height,
999 box->depth);
1000 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1001 "%u (box.x)\n", item->id, box->x);
1002
1003
1004 assert(resource->target == PIPE_BUFFER);
1005 assert(resource->bind & PIPE_BIND_GLOBAL);
1006 assert(box->x >= 0);
1007 assert(box->y == 0);
1008 assert(box->z == 0);
1009
1010 ///TODO: do it better, mapping is not possible if the pool is too big
1011 return pipe_buffer_map_range(ctx, dst,
1012 offset, box->width, usage, ptransfer);
1013 }
1014
1015 void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1016 struct pipe_transfer *transfer)
1017 {
1018 /* struct r600_resource_global are not real resources, they just map
1019 * to an offset within the compute memory pool. The function
1020 * r600_compute_global_transfer_map() maps the memory pool
1021 * resource rather than the struct r600_resource_global passed to
1022 * it as an argument and then initializes ptransfer->resource with
1023 * the memory pool resource (via pipe_buffer_map_range).
1024 * When transfer_unmap is called it uses the memory pool's
1025 * vtable which calls r600_buffer_transfer_unmap() rather than
1026 * this function.
1027 */
1028 assert (!"This function should not be called");
1029 }
1030
1031 void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1032 struct pipe_transfer *transfer,
1033 const struct pipe_box *box)
1034 {
1035 assert(0 && "TODO");
1036 }
1037
1038 void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
1039 struct pipe_resource *resource,
1040 unsigned level,
1041 unsigned usage,
1042 const struct pipe_box *box,
1043 const void *data,
1044 unsigned stride,
1045 unsigned layer_stride)
1046 {
1047 assert(0 && "TODO");
1048 }