r600g: remove TGSI->LLVM translation
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding writes
57 VTX1 is for global binding reads
58
59 for writing images: RAT1...
60 for reading images: TEX2...
61 TEX2 and RAT1 are paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or when the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing,
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => 10 image bindings for writing max.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too because of linear indexing
83
84 */
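/* An illustrative example of the binding layout described above (the slot
 * numbers are just this scheme's convention, shown here for reference):
 *
 *   global buffers      -> RAT0 for writes, VTX1 for reads
 *   kernel parameters   -> CONST0 (small, constant-indexed) or VTX0
 *   writable image #0   -> RAT1, read back through TEX2
 *   writable image #1   -> RAT2, read back through TEX3
 *   ...
 *   writable image #9   -> RAT10 (the remaining RAT kept spare for byte addressing)
 */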
85
86 struct r600_resource* r600_compute_buffer_alloc_vram(
87 struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource * buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create(
94 (struct pipe_screen*) screen,
95 PIPE_BIND_CUSTOM,
96 PIPE_USAGE_IMMUTABLE,
97 size);
98
99 return (struct r600_resource *)buffer;
100 }
101
102
103 static void evergreen_set_rat(
104 struct r600_pipe_compute *pipe,
105 unsigned id,
106 struct r600_resource* bo,
107 int start,
108 int size)
109 {
110 struct pipe_surface rat_templ;
111 struct r600_surface *surf = NULL;
112 struct r600_context *rctx = NULL;
113
114 assert(id < 12);
115 assert((size & 3) == 0);
116 assert((start & 0xFF) == 0);
117
118 rctx = pipe->ctx;
119
120 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
121
122 /* Create the RAT surface */
123 memset(&rat_templ, 0, sizeof(rat_templ));
124 rat_templ.format = PIPE_FORMAT_R32_UINT;
125 rat_templ.u.tex.level = 0;
126 rat_templ.u.tex.first_layer = 0;
127 rat_templ.u.tex.last_layer = 0;
128
129 /* Add the RAT to the list of color buffers */
130 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131 (struct pipe_context *)pipe->ctx,
132 (struct pipe_resource *)bo, &rat_templ);
133
134 /* Update the number of color buffers */
135 pipe->ctx->framebuffer.state.nr_cbufs =
136 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138 /* Update the cb_target_mask
139 * XXX: I think this is a potential spot for bugs once we start doing
140 * GL interop. cb_target_mask may be modified in the 3D sections
141 * of this driver. */
142 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145 evergreen_init_color_surface_rat(rctx, surf);
146 }
147
148 static void evergreen_cs_set_vertex_buffer(
149 struct r600_context * rctx,
150 unsigned vb_index,
151 unsigned offset,
152 struct pipe_resource * buffer)
153 {
154 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156 vb->stride = 1;
157 vb->buffer_offset = offset;
158 vb->buffer = buffer;
159 vb->user_buffer = NULL;
160
161 /* The vertex instructions in the compute shaders use the texture cache,
162 * so we need to invalidate it. */
163 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164 state->enabled_mask |= 1 << vb_index;
165 state->dirty_mask |= 1 << vb_index;
166 r600_mark_atom_dirty(rctx, &state->atom);
167 }
168
169 static void evergreen_cs_set_constant_buffer(
170 struct r600_context * rctx,
171 unsigned cb_index,
172 unsigned offset,
173 unsigned size,
174 struct pipe_resource * buffer)
175 {
176 struct pipe_constant_buffer cb;
177 cb.buffer_size = size;
178 cb.buffer_offset = offset;
179 cb.buffer = buffer;
180 cb.user_buffer = NULL;
181
182 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183 }
184
185 static const struct u_resource_vtbl r600_global_buffer_vtbl =
186 {
187 u_default_resource_get_handle, /* get_handle */
188 r600_compute_global_buffer_destroy, /* resource_destroy */
189 r600_compute_global_transfer_map, /* transfer_map */
190 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191 r600_compute_global_transfer_unmap, /* transfer_unmap */
192 r600_compute_global_transfer_inline_write /* transfer_inline_write */
193 };
194
195 /* We need to define these R600 registers here, because we can't include
196 * both evergreend.h and r600d.h at the same time.
197 */
198 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
199 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
200
201 #ifdef HAVE_OPENCL
202
203 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
204 struct r600_bytecode *bc,
205 uint64_t symbol_offset,
206 boolean *use_kill)
207 {
208 unsigned i;
209 const unsigned char *config =
210 radeon_shader_binary_config_start(binary, symbol_offset);
211
212 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
213 unsigned reg =
214 util_le32_to_cpu(*(uint32_t*)(config + i));
215 unsigned value =
216 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
217 switch (reg) {
218 /* R600 / R700 */
219 case R_028850_SQ_PGM_RESOURCES_PS:
220 case R_028868_SQ_PGM_RESOURCES_VS:
221 /* Evergreen / Northern Islands */
222 case R_028844_SQ_PGM_RESOURCES_PS:
223 case R_028860_SQ_PGM_RESOURCES_VS:
224 case R_0288D4_SQ_PGM_RESOURCES_LS:
225 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
226 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
227 break;
228 case R_02880C_DB_SHADER_CONTROL:
229 *use_kill = G_02880C_KILL_ENABLE(value);
230 break;
231 case R_0288E8_SQ_LDS_ALLOC:
232 bc->nlds_dw = value;
233 break;
234 }
235 }
236 }
237
238 static unsigned r600_create_shader(struct r600_bytecode *bc,
239 const struct radeon_shader_binary *binary,
240 boolean *use_kill)
241
242 {
243 assert(binary->code_size % 4 == 0);
244 bc->bytecode = CALLOC(1, binary->code_size);
245 memcpy(bc->bytecode, binary->code, binary->code_size);
246 bc->ndw = binary->code_size / 4;
247
248 r600_shader_binary_read_config(binary, bc, 0, use_kill);
249 return 0;
250 }
251
252 #endif
253
254 static void r600_destroy_shader(struct r600_bytecode *bc)
255 {
256 FREE(bc->bytecode);
257 }
258
259 void *evergreen_create_compute_state(
260 struct pipe_context *ctx_,
261 const struct pipe_compute_state *cso)
262 {
263 struct r600_context *ctx = (struct r600_context *)ctx_;
264 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
265 #ifdef HAVE_OPENCL
266 const struct pipe_llvm_program_header * header;
267 const char *code;
268 void *p;
269 boolean use_kill;
270
271 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
272 header = cso->prog;
273 code = cso->prog + sizeof(struct pipe_llvm_program_header);
274 radeon_shader_binary_init(&shader->binary);
275 radeon_elf_read(code, header->num_bytes, &shader->binary);
276 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
277
278 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
279 shader->bc.ndw * 4);
280 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
281 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
282 ctx->b.ws->buffer_unmap(shader->code_bo->buf);
283 #endif
284
285 shader->ctx = ctx;
286 shader->local_size = cso->req_local_mem;
287 shader->private_size = cso->req_private_mem;
288 shader->input_size = cso->req_input_mem;
289
290 return shader;
291 }
292
293 void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
294 {
295 struct r600_context *ctx = (struct r600_context *)ctx_;
296 COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
297 struct r600_pipe_compute *shader = state;
298
299 if (!shader)
300 return;
301
302 radeon_shader_binary_clean(&shader->binary);
303 r600_destroy_shader(&shader->bc);
304
305 /* TODO destroy shader->code_bo, shader->const_bo
306 * we'll need something like r600_buffer_free */
307 FREE(shader);
308 }
309
310 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
311 {
312 struct r600_context *ctx = (struct r600_context *)ctx_;
313
314 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
315
316 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
317 }
318
319 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
320 * explicit kernel parameters, there are implicit parameters that need to be
321 * stored in the vertex buffer as well. Here is how these parameters are
322 * organized in the buffer (see also the sketch after this function):
323 *
324 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
325 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
326 * DWORDS 6-8: Number of work items within each work group in each dimension
327 * (x,y,z)
328 * DWORDS 9+ : Kernel parameters
329 */
330 void evergreen_compute_upload_input(
331 struct pipe_context *ctx_,
332 const uint *block_layout,
333 const uint *grid_layout,
334 const void *input)
335 {
336 struct r600_context *ctx = (struct r600_context *)ctx_;
337 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
338 unsigned i;
339 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
340 * parameters.
341 */
342 unsigned input_size = shader->input_size + 36;
343 uint32_t * num_work_groups_start;
344 uint32_t * global_size_start;
345 uint32_t * local_size_start;
346 uint32_t * kernel_parameters_start;
347 struct pipe_box box;
348 struct pipe_transfer *transfer = NULL;
349
350 if (shader->input_size == 0) {
351 return;
352 }
353
354 if (!shader->kernel_param) {
355 /* Add space for the grid dimensions */
356 shader->kernel_param = (struct r600_resource *)
357 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
358 PIPE_USAGE_IMMUTABLE, input_size);
359 }
360
361 u_box_1d(0, input_size, &box);
362 num_work_groups_start = ctx_->transfer_map(ctx_,
363 (struct pipe_resource*)shader->kernel_param,
364 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
365 &box, &transfer);
366 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
367 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
368 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
369
370 /* Copy the number of work groups */
371 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
372
373 /* Copy the global size */
374 for (i = 0; i < 3; i++) {
375 global_size_start[i] = grid_layout[i] * block_layout[i];
376 }
377
378 /* Copy the local dimensions */
379 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
380
381 /* Copy the kernel inputs */
382 memcpy(kernel_parameters_start, input, shader->input_size);
383
384 for (i = 0; i < (input_size / 4); i++) {
385 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
386 ((unsigned*)num_work_groups_start)[i]);
387 }
388
389 ctx_->transfer_unmap(ctx_, transfer);
390
391 /* ID=0 is reserved for the parameters */
392 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
393 (struct pipe_resource*)shader->kernel_param);
394 }
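/* For illustration only: the input buffer written by the function above can
 * be pictured as the following hypothetical layout. This struct is not part
 * of the driver; the 9 implicit dwords correspond to the 36 bytes reserved
 * above.
 *
 *   struct implicit_kernel_input {
 *           uint32_t num_work_groups[3]; // DWORDS 0-2: grid_layout[]
 *           uint32_t global_size[3];     // DWORDS 3-5: grid_layout[] * block_layout[]
 *           uint32_t local_size[3];      // DWORDS 6-8: block_layout[]
 *           // DWORDS 9+: user-supplied kernel parameters follow
 *   };
 */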
395
396 static void evergreen_emit_direct_dispatch(
397 struct r600_context *rctx,
398 const uint *block_layout, const uint *grid_layout)
399 {
400 int i;
401 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
402 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
403 unsigned num_waves;
404 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
405 unsigned wave_divisor = (16 * num_pipes);
406 int group_size = 1;
407 int grid_size = 1;
408 unsigned lds_size = shader->local_size / 4 +
409 shader->bc.nlds_dw;
410
411
412 /* Calculate group_size/grid_size */
413 for (i = 0; i < 3; i++) {
414 group_size *= block_layout[i];
415 }
416
417 for (i = 0; i < 3; i++) {
418 grid_size *= grid_layout[i];
419 }
420
421 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
422 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
423 wave_divisor - 1) / wave_divisor;
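/* Worked example with made-up numbers: a 16x16x1 thread block on a part
 * with num_pipes = 2 gives wave_divisor = 32, so
 * num_waves = (256 + 31) / 32 = 8 wavefronts per thread block. */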
424
425 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
426 "%u wavefronts per thread block, "
427 "allocating %u dwords lds.\n",
428 num_pipes, num_waves, lds_size);
429
430 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
431
432 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
433 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
434 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
435 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
436
437 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
438 group_size);
439
440 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
441 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
442 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
443 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
444
445 if (rctx->b.chip_class < CAYMAN) {
446 assert(lds_size <= 8192);
447 } else {
448 /* Cayman appears to have a slightly smaller limit, see the
449 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
450 assert(lds_size <= 8160);
451 }
452
453 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
454 lds_size | (num_waves << 14));
455
456 /* Dispatch packet */
457 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
458 radeon_emit(cs, grid_layout[0]);
459 radeon_emit(cs, grid_layout[1]);
460 radeon_emit(cs, grid_layout[2]);
461 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
462 radeon_emit(cs, 1);
463 }
464
465 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
466 const uint *grid_layout)
467 {
468 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
469 unsigned i;
470
471 /* make sure that the gfx ring is the only one active */
472 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
473 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
474 }
475
476 /* Initialize all the compute-related registers.
477 *
478 * See evergreen_init_atom_start_compute_cs() in this file for the list
479 * of registers initialized by the start_compute_cs_cmd atom.
480 */
481 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
482
483 /* emit config state */
484 if (ctx->b.chip_class == EVERGREEN)
485 r600_emit_atom(ctx, &ctx->config_state.atom);
486
487 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
488 r600_flush_emit(ctx);
489
490 /* Emit colorbuffers. */
491 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
492 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
493 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
494 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
495 (struct r600_resource*)cb->base.texture,
496 RADEON_USAGE_READWRITE,
497 RADEON_PRIO_SHADER_RW_BUFFER);
498
499 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
500 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
501 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
502 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
503 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
504 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
505 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
506 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
507
508 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
509 radeon_emit(cs, reloc);
510
511 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
512 radeon_emit(cs, reloc);
513 }
514 for (; i < 8 ; i++)
515 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
516 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
517 for (; i < 12; i++)
518 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
519 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
520
521 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
522 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
523 ctx->compute_cb_target_mask);
524
525
526 /* Emit vertex buffer state */
527 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
528 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
529
530 /* Emit constant buffer state */
531 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
532
533 /* Emit sampler state */
534 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
535
536 /* Emit sampler view (texture resource) state */
537 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
538
539 /* Emit compute shader state */
540 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
541
542 /* Emit dispatch state and dispatch packet */
543 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
544
545 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
546 */
547 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
548 R600_CONTEXT_INV_VERTEX_CACHE |
549 R600_CONTEXT_INV_TEX_CACHE;
550 r600_flush_emit(ctx);
551 ctx->b.flags = 0;
552
553 if (ctx->b.chip_class >= CAYMAN) {
554 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
555 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
556 /* DEALLOC_STATE prevents the GPU from hanging when a
557 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
558 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
559 */
560 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
561 cs->buf[cs->cdw++] = 0;
562 }
563
564 #if 0
565 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
566 for (i = 0; i < cs->cdw; i++) {
567 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
568 }
569 #endif
570
571 }
572
573
574 /**
575 * Emit function for r600_cs_shader_state atom
576 */
577 void evergreen_emit_cs_shader(
578 struct r600_context *rctx,
579 struct r600_atom *atom)
580 {
581 struct r600_cs_shader_state *state =
582 (struct r600_cs_shader_state*)atom;
583 struct r600_pipe_compute *shader = state->shader;
584 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
585 uint64_t va;
586 struct r600_resource *code_bo;
587 unsigned ngpr, nstack;
588
589 code_bo = shader->code_bo;
590 va = shader->code_bo->gpu_address + state->pc;
591 ngpr = shader->bc.ngpr;
592 nstack = shader->bc.nstack;
593
594 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
595 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
596 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
597 S_0288D4_NUM_GPRS(ngpr)
598 | S_0288D4_STACK_SIZE(nstack));
599 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
600
601 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
602 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
603 code_bo, RADEON_USAGE_READ,
604 RADEON_PRIO_USER_SHADER));
605 }
606
607 static void evergreen_launch_grid(
608 struct pipe_context *ctx_, const struct pipe_grid_info *info)
609 {
610 struct r600_context *ctx = (struct r600_context *)ctx_;
611 #ifdef HAVE_OPENCL
612 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
613 boolean use_kill;
614
615 ctx->cs_shader_state.pc = info->pc;
616 /* Get the config information for this kernel. */
617 r600_shader_binary_read_config(&shader->binary, &shader->bc,
618 info->pc, &use_kill);
619 #endif
620
621 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
622
623
624 evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input);
625 compute_emit_cs(ctx, info->block, info->grid);
626 }
627
628 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
629 unsigned start, unsigned count,
630 struct pipe_surface ** surfaces)
631 {
632 struct r600_context *ctx = (struct r600_context *)ctx_;
633 struct r600_surface **resources = (struct r600_surface **)surfaces;
634
635 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
636 start, count);
637
638 for (unsigned i = 0; i < count; i++) {
639 /* The first two vertex buffers are reserved for parameters and
640 * global buffers. */
641 unsigned vtx_id = 2 + i;
642 if (resources[i]) {
643 struct r600_resource_global *buffer =
644 (struct r600_resource_global*)
645 resources[i]->base.texture;
646 if (resources[i]->base.writable) {
647 assert(i+1 < 12);
648
649 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
650 (struct r600_resource *)resources[i]->base.texture,
651 buffer->chunk->start_in_dw*4,
652 resources[i]->base.texture->width0);
653 }
654
655 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
656 buffer->chunk->start_in_dw * 4,
657 resources[i]->base.texture);
658 }
659 }
660 }
661
662 static void evergreen_set_global_binding(
663 struct pipe_context *ctx_, unsigned first, unsigned n,
664 struct pipe_resource **resources,
665 uint32_t **handles)
666 {
667 struct r600_context *ctx = (struct r600_context *)ctx_;
668 struct compute_memory_pool *pool = ctx->screen->global_pool;
669 struct r600_resource_global **buffers =
670 (struct r600_resource_global **)resources;
671 unsigned i;
672
673 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
674 first, n);
675
676 if (!resources) {
677 /* XXX: Unset */
678 return;
679 }
680
681 /* We mark these items for promotion to the pool if they
682 * aren't already there */
683 for (i = first; i < first + n; i++) {
684 struct compute_memory_item *item = buffers[i]->chunk;
685
686 if (!is_item_in_pool(item))
687 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
688 }
689
690 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
691 /* XXX: Unset */
692 return;
693 }
694
695 for (i = first; i < first + n; i++)
696 {
697 uint32_t buffer_offset;
698 uint32_t handle;
699 assert(resources[i]->target == PIPE_BUFFER);
700 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
701
702 buffer_offset = util_le32_to_cpu(*(handles[i]));
703 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
704
705 *(handles[i]) = util_cpu_to_le32(handle);
706 }
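/* For example (hypothetical numbers): if a buffer's chunk starts at dword
 * 256 of the pool and the state tracker passed in an offset of 16 bytes,
 * the handle written back is 16 + 256 * 4 = 1040, i.e. a byte offset into
 * the pool that is bound at RAT0/VTX1 below. */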
707
708 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
709 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
710 (struct pipe_resource*)pool->bo);
711 }
712
713 /**
714 * This function initializes all the compute specific registers that need to
715 * be initialized for each compute command stream. Registers that are common
716 * to both compute and 3D will be initialized at the beginning of each compute
717 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
718 * packet requires that the shader type bit be set, we must initialize all
719 * context registers needed for compute in this function. The registers
720 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
721 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
722 * on the GPU family.
723 */
724 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
725 {
726 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
727 int num_threads;
728 int num_stack_entries;
729
730 /* since all required registers are initialized in the
731 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
732 */
733 r600_init_command_buffer(cb, 256);
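/* Presumably the r600_store_*_reg() helpers OR cb->pkt_flags into the PKT3
 * headers they build, so setting the compute-mode (shader type) flag below
 * makes every register write in this buffer target the compute state. */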
734 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
735
736 /* This must be first. */
737 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
738 r600_store_value(cb, 0x80000000);
739 r600_store_value(cb, 0x80000000);
740
741 /* We're setting config registers here. */
742 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
743 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
744
745 switch (ctx->b.family) {
746 case CHIP_CEDAR:
747 default:
748 num_threads = 128;
749 num_stack_entries = 256;
750 break;
751 case CHIP_REDWOOD:
752 num_threads = 128;
753 num_stack_entries = 256;
754 break;
755 case CHIP_JUNIPER:
756 num_threads = 128;
757 num_stack_entries = 512;
758 break;
759 case CHIP_CYPRESS:
760 case CHIP_HEMLOCK:
761 num_threads = 128;
762 num_stack_entries = 512;
763 break;
764 case CHIP_PALM:
765 num_threads = 128;
766 num_stack_entries = 256;
767 break;
768 case CHIP_SUMO:
769 num_threads = 128;
770 num_stack_entries = 256;
771 break;
772 case CHIP_SUMO2:
773 num_threads = 128;
774 num_stack_entries = 512;
775 break;
776 case CHIP_BARTS:
777 num_threads = 128;
778 num_stack_entries = 512;
779 break;
780 case CHIP_TURKS:
781 num_threads = 128;
782 num_stack_entries = 256;
783 break;
784 case CHIP_CAICOS:
785 num_threads = 128;
786 num_stack_entries = 256;
787 break;
788 }
789
790 /* Config Registers */
791 if (ctx->b.chip_class < CAYMAN)
792 evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
793 ctx->screen->b.info.drm_minor);
794 else
795 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
796 ctx->screen->b.info.drm_minor);
797
798 /* The primitive type always needs to be POINTLIST for compute. */
799 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
800 V_008958_DI_PT_POINTLIST);
801
802 if (ctx->b.chip_class < CAYMAN) {
803
804 /* These registers control which simds can be used by each stage.
805 * The default for these registers is 0xffffffff, which means
806 * all simds are available for each stage. It's possible we may
807 * want to play around with these in the future, but for now
808 * the default value is fine.
809 *
810 * R_008E20_SQ_STATIC_THREAD_MGMT1
811 * R_008E24_SQ_STATIC_THREAD_MGMT2
812 * R_008E28_SQ_STATIC_THREAD_MGMT3
813 */
814
815 /* XXX: We may need to adjust the thread and stack resource
816 * values for 3D/compute interop */
817
818 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
819
820 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
821 * Set the number of threads used by the PS/VS/GS/ES stage to
822 * 0.
823 */
824 r600_store_value(cb, 0);
825
826 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
827 * Set the number of threads used by the CS (aka LS) stage to
828 * the maximum number of threads and set the number of threads
829 * for the HS stage to 0. */
830 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
831
832 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
833 * Set the Control Flow stack entries to 0 for PS/VS stages */
834 r600_store_value(cb, 0);
835
836 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
837 * Set the Control Flow stack entries to 0 for GS/ES stages */
838 r600_store_value(cb, 0);
839
840 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
841 * Set the Control Flow stack entries to 0 for the HS stage, and
842 * set it to the maximum value for the CS (aka LS) stage. */
843 r600_store_value(cb,
844 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
845 }
846 /* Give the compute shader all the available LDS space.
847 * NOTE: This only sets the maximum number of dwords that a compute
848 * shader can allocate. When a shader is executed, we still need to
849 * allocate the appropriate amount of LDS dwords using the
850 * CM_R_0288E8_SQ_LDS_ALLOC register.
851 */
852 if (ctx->b.chip_class < CAYMAN) {
853 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
854 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
855 } else {
856 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
857 S_0286FC_NUM_PS_LDS(0) |
858 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
859 }
860
861 /* Context Registers */
862
863 if (ctx->b.chip_class < CAYMAN) {
864 /* workaround for hw issues with dyn gpr - must set all limits
865 * to 240 instead of 0, 0x1e == 240 / 8
866 */
867 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
868 S_028838_PS_GPRS(0x1e) |
869 S_028838_VS_GPRS(0x1e) |
870 S_028838_GS_GPRS(0x1e) |
871 S_028838_ES_GPRS(0x1e) |
872 S_028838_HS_GPRS(0x1e) |
873 S_028838_LS_GPRS(0x1e));
874 }
875
876 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
877 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
878 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
879
880 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
881
882 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
883 S_0286E8_TID_IN_GROUP_ENA |
884 S_0286E8_TGID_ENA |
885 S_0286E8_DISABLE_INDEX_PACK);
886
887
888 /* The LOOP_CONST registers are an optimization for loops that allows
889 * you to store the initial counter, increment value, and maximum
890 * counter value in a register, so that hardware can calculate the
891 * correct number of iterations for the loop and you don't need
892 * to keep the loop counter in your shader code. We don't currently use
893 * this optimization, so we must keep track of the counter in the
894 * shader and use a break instruction to exit loops. However, the
895 * hardware will still use this register to determine when to exit a
896 * loop, so we need to initialize the counter to 0, set the increment
897 * value to 1 and the maximum counter value to 4095 (0xfff), which
898 * is the maximum value allowed. This gives us a maximum of 4096
899 * iterations for our loops, but hopefully our break instruction will
900 * execute some time before the 4096th iteration.
901 */
902 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
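/* A rough decode of 0x1000FFF, assuming the usual R600 SQ_LOOP_CONST layout
 * (COUNT in bits [11:0], INIT in bits [23:12], INC in bits [31:24]):
 * COUNT = 0xFFF (4095), INIT = 0, INC = 1, matching the comment above. */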
903 }
904
905 void evergreen_init_compute_state_functions(struct r600_context *ctx)
906 {
907 ctx->b.b.create_compute_state = evergreen_create_compute_state;
908 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
909 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
910 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
911 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
912 ctx->b.b.set_global_binding = evergreen_set_global_binding;
913 ctx->b.b.launch_grid = evergreen_launch_grid;
914
915 }
916
917 struct pipe_resource *r600_compute_global_buffer_create(
918 struct pipe_screen *screen,
919 const struct pipe_resource *templ)
920 {
921 struct r600_resource_global* result = NULL;
922 struct r600_screen* rscreen = NULL;
923 int size_in_dw = 0;
924
925 assert(templ->target == PIPE_BUFFER);
926 assert(templ->bind & PIPE_BIND_GLOBAL);
927 assert(templ->array_size == 1 || templ->array_size == 0);
928 assert(templ->depth0 == 1 || templ->depth0 == 0);
929 assert(templ->height0 == 1 || templ->height0 == 0);
930
931 result = (struct r600_resource_global*)
932 CALLOC(sizeof(struct r600_resource_global), 1);
933 rscreen = (struct r600_screen*)screen;
934
935 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
936 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
937 templ->array_size);
938
939 result->base.b.vtbl = &r600_global_buffer_vtbl;
940 result->base.b.b = *templ;
941 result->base.b.b.screen = screen;
942 pipe_reference_init(&result->base.b.b.reference, 1);
943
944 size_in_dw = (templ->width0+3) / 4;
945
946 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
947
948 if (result->chunk == NULL)
949 {
950 free(result);
951 return NULL;
952 }
953
954 return &result->base.b.b;
955 }
956
957 void r600_compute_global_buffer_destroy(
958 struct pipe_screen *screen,
959 struct pipe_resource *res)
960 {
961 struct r600_resource_global* buffer = NULL;
962 struct r600_screen* rscreen = NULL;
963
964 assert(res->target == PIPE_BUFFER);
965 assert(res->bind & PIPE_BIND_GLOBAL);
966
967 buffer = (struct r600_resource_global*)res;
968 rscreen = (struct r600_screen*)screen;
969
970 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
971
972 buffer->chunk = NULL;
973 free(res);
974 }
975
976 void *r600_compute_global_transfer_map(
977 struct pipe_context *ctx_,
978 struct pipe_resource *resource,
979 unsigned level,
980 unsigned usage,
981 const struct pipe_box *box,
982 struct pipe_transfer **ptransfer)
983 {
984 struct r600_context *rctx = (struct r600_context*)ctx_;
985 struct compute_memory_pool *pool = rctx->screen->global_pool;
986 struct r600_resource_global* buffer =
987 (struct r600_resource_global*)resource;
988
989 struct compute_memory_item *item = buffer->chunk;
990 struct pipe_resource *dst = NULL;
991 unsigned offset = box->x;
992
993 if (is_item_in_pool(item)) {
994 compute_memory_demote_item(pool, item, ctx_);
995 }
996 else {
997 if (item->real_buffer == NULL) {
998 item->real_buffer =
999 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1000 }
1001 }
1002
1003 dst = (struct pipe_resource*)item->real_buffer;
1004
1005 if (usage & PIPE_TRANSFER_READ)
1006 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1007
1008 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1009 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1010 "width = %u, height = %u, depth = %u)\n", level, usage,
1011 box->x, box->y, box->z, box->width, box->height,
1012 box->depth);
1013 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1014 "%u (box.x)\n", item->id, box->x);
1015
1016
1017 assert(resource->target == PIPE_BUFFER);
1018 assert(resource->bind & PIPE_BIND_GLOBAL);
1019 assert(box->x >= 0);
1020 assert(box->y == 0);
1021 assert(box->z == 0);
1022
1023 ///TODO: do it better, mapping is not possible if the pool is too big
1024 return pipe_buffer_map_range(ctx_, dst,
1025 offset, box->width, usage, ptransfer);
1026 }
1027
1028 void r600_compute_global_transfer_unmap(
1029 struct pipe_context *ctx_,
1030 struct pipe_transfer* transfer)
1031 {
1032 /* struct r600_resource_global are not real resources; they just map
1033 * to an offset within the compute memory pool. The function
1034 * r600_compute_global_transfer_map() maps the memory pool
1035 * resource rather than the struct r600_resource_global passed to
1036 * it as an argument, and then initializes ptransfer->resource with
1037 * the memory pool resource (via pipe_buffer_map_range).
1038 * When transfer_unmap is called, it uses the memory pool's
1039 * vtable, which calls r600_buffer_transfer_unmap() rather than
1040 * this function.
1041 */
1042 assert (!"This function should not be called");
1043 }
1044
1045 void r600_compute_global_transfer_flush_region(
1046 struct pipe_context *ctx_,
1047 struct pipe_transfer *transfer,
1048 const struct pipe_box *box)
1049 {
1050 assert(0 && "TODO");
1051 }
1052
1053 void r600_compute_global_transfer_inline_write(
1054 struct pipe_context *pipe,
1055 struct pipe_resource *resource,
1056 unsigned level,
1057 unsigned usage,
1058 const struct pipe_box *box,
1059 const void *data,
1060 unsigned stride,
1061 unsigned layer_stride)
1062 {
1063 assert(0 && "TODO");
1064 }