r600g: flush depth textures bound to vertex shaders
mesa.git: src/gallium/drivers/r600/evergreen_compute.c
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "r600.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "r600_hw_context_priv.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "llvm_wrapper.h"
#endif

/**
 * RAT0 is for global binding writes.
 * VTX1 is for global binding reads.
 *
 * Images are written through RAT1... and read through TEX2...;
 * TEX2 and RAT1 are paired, and so on up the IDs.
 *
 * TEX2... consumes the same fetch resources that VTX2... would consume.
 *
 * CONST0 and VTX0 are for the kernel parameters:
 *   CONST0 binds the smaller input parameter buffer and is used for constant
 *   indexing; it is also constant cached.
 *   VTX0 is used for indirect/non-constant indexing, or when the input is
 *   bigger than the constant cache can handle.
 *
 * RATs are limited to 12, so we can bind at most 11 textures for writing,
 * because RAT0 is reserved for global bindings. With byte addressing enabled
 * we should reserve another one too, which leaves at most 10 image bindings
 * for writing.
 *
 * For comparison, NVIDIA's OpenCL implementation reports:
 *   CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
 *   CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
 *
 * so 10 write bindings are enough. 176 is the maximum for reading according
 * to the docs.
 *
 * Writable images should be listed first (id < 10), so that image id
 * corresponds to RAT(id+1). Writable images also consume TEX slots, and VTX
 * slots as well because of linear indexing.
 */
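
/* Summary of the resulting slot assignments, as implemented by the setters
 * further down in this file (illustrative overview, not an exhaustive spec):
 *
 *   kernel parameter buffer       -> CONST0 + VTX0   (evergreen_compute_upload_input)
 *   global buffer pool            -> RAT0 (writes) + VTX1 (reads)
 *                                                    (evergreen_set_global_binding)
 *   compute surface i (writable)  -> RAT(i+1) + VTX(i+2)
 *                                                    (evergreen_set_compute_resources)
 *   sampler view i                -> TEX(i+2)        (evergreen_set_cs_sampler_view)
 */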

const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_get_transfer, /* get_transfer */
	r600_compute_global_transfer_destroy, /* transfer_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};


void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const unsigned char *code;

	COMPUTE_DBG("*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	if (!ctx->screen->screen.get_param(&ctx->screen->screen,
							PIPE_CAP_COMPUTE)) {
		fprintf(stderr, "Compute is not supported\n");
		return NULL;
	}
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

	shader->ctx = (struct r600_context*)ctx;
	shader->resources = (struct evergreen_compute_resource*)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->mod = llvm_parse_bitcode(code, header->num_bytes);

	r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
#endif
	return shader;
}
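
/* Illustrative view of the blob handed in through pipe_compute_state::prog
 * when HAVE_OPENCL is defined, as implied by the parsing code above:
 *
 *   +----------------------------------+----------------------------------+
 *   | struct pipe_llvm_program_header  | header->num_bytes of LLVM        |
 *   | (carries num_bytes)              | bitcode, fed to                  |
 *   |                                  | llvm_parse_bitcode()             |
 *   +----------------------------------+----------------------------------+
 */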

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG("*** evergreen_bind_compute_state\n");

	ctx->cs_shader = (struct r600_pipe_compute *)state;

	if (!ctx->cs_shader->shader_code_bo) {

		ctx->cs_shader->shader_code_bo =
			r600_compute_buffer_alloc_vram(ctx->screen,
					ctx->cs_shader->bc.ndw * 4);

		void *p = ctx->ws->buffer_map(
				ctx->cs_shader->shader_code_bo->cs_buf,
				ctx->cs, PIPE_TRANSFER_WRITE);

		memcpy(p, ctx->cs_shader->bc.bytecode, ctx->cs_shader->bc.ndw * 4);

		ctx->ws->buffer_unmap(ctx->cs_shader->shader_code_bo->cs_buf);

	}

	struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
					COMPUTE_RESOURCE_SHADER, 0);

	if (ctx->chip_class < CAYMAN) {
		evergreen_reg_set(res, R_008C0C_SQ_GPR_RESOURCE_MGMT_3,
			S_008C0C_NUM_LS_GPRS(ctx->cs_shader->bc.ngpr));
	}

	///maybe we can use it later
	evergreen_reg_set(res, R_0286C8_SPI_THREAD_GROUPING, 0);
	///maybe we can use it later
	evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);

	evergreen_reg_set(res, R_0288D4_SQ_PGM_RESOURCES_LS,
		S_0288D4_NUM_GPRS(ctx->cs_shader->bc.ngpr)
		| S_0288D4_STACK_SIZE(ctx->cs_shader->bc.nstack));
	evergreen_reg_set(res, R_0288D8_SQ_PGM_RESOURCES_LS_2, 0);

	evergreen_reg_set(res, R_0288D0_SQ_PGM_START_LS, 0);
	res->bo = ctx->cs_shader->shader_code_bo;
	res->usage = RADEON_USAGE_READ;
	res->coher_bo_size = ctx->cs_shader->bc.ndw * 4;

	r600_inval_shader_cache(ctx);

}

/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	int i;
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;

	if (ctx->cs_shader->input_size == 0) {
		return;
	}

	if (!ctx->cs_shader->kernel_param) {
		unsigned buffer_size = ctx->cs_shader->input_size;

		/* Add space for the grid dimensions */
		buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
		ctx->cs_shader->kernel_param =
				r600_compute_buffer_alloc_vram(ctx->screen,
						buffer_size);
	}

	num_work_groups_start = ctx->ws->buffer_map(
			ctx->cs_shader->kernel_param->cs_buf,
			ctx->cs, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups (grid size) */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, ctx->cs_shader->input_size);

	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
			(ctx->cs_shader->input_size / 4); i++) {
		COMPUTE_DBG("input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(ctx->cs_shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_set_vtx_resource(ctx->cs_shader,
		ctx->cs_shader->kernel_param, 0, 0, 0);
	///ID=0 is reserved for the parameters
	evergreen_set_const_cache(ctx->cs_shader, 0,
		ctx->cs_shader->kernel_param, ctx->cs_shader->input_size, 0);
}
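
/* Worked example of the layout produced above (illustrative values only):
 * launching a kernel with grid_layout = {4, 2, 1}, block_layout = {64, 1, 1}
 * and 8 bytes of kernel arguments fills the vertex buffer with:
 *
 *   DWORDS 0-2: 4, 2, 1      (work groups per dimension)
 *   DWORDS 3-5: 256, 2, 1    (global work items: 4*64, 2*1, 1*1)
 *   DWORDS 6-8: 64, 1, 1     (work items per work group)
 *   DWORDS 9+ : the 8 bytes of kernel arguments, copied verbatim
 */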

void evergreen_direct_dispatch(
	struct pipe_context *ctx_,
	const uint *block_layout, const uint *grid_layout)
{
	/* This struct r600_context* must be called rctx, because the
	 * r600_pipe_state_add_reg macro assumes there is a local variable
	 * of type struct r600_context* called rctx.
	 */
	struct r600_context *rctx = (struct r600_context *)ctx_;

	int i;

	struct evergreen_compute_resource* res = get_empty_res(rctx->cs_shader,
					COMPUTE_RESOURCE_DISPATCH, 0);

	/* Set CB_TARGET_MASK */
	evergreen_reg_set(res, R_028238_CB_TARGET_MASK, rctx->compute_cb_target_mask);

	evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST);

	evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0);
	evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0);
	evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0);

	evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]);
	evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]);
	evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]);

	int group_size = 1;

	int grid_size = 1;

	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size);
	evergreen_reg_set(res, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size);

	evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	evergreen_emit_raw_value(res, grid_layout[0]);
	evergreen_emit_raw_value(res, grid_layout[1]);
	evergreen_emit_raw_value(res, grid_layout[2]);
	///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN
	evergreen_emit_raw_value(res, 1);
}
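
/* Illustrative only: for grid_layout = {4, 2, 1} the raw values emitted above
 * form a DISPATCH_DIRECT packet whose payload is the grid dimensions followed
 * by the dispatch initiator (COMPUTE_SHADER_EN):
 *
 *   PKT3C(PKT3_DISPATCH_DIRECT, 3, 0), 4, 2, 1, 1
 */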

static void compute_emit_cs(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	int i;

	struct r600_resource *onebo = NULL;
	struct r600_pipe_state *cb_state;

	/* Initialize all the registers common to both 3D and compute. Some
	 * 3D-only registers will be initialized by this atom as well, but
	 * this is OK for now.
	 *
	 * See evergreen_init_atom_start_cs() or cayman_init_atom_start_cs() in
	 * evergreen_state.c for the list of registers that are initialized by
	 * the start_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);

	/* Initialize all the compute specific registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom);

	/* Emit cb_state */
	cb_state = ctx->states[R600_PIPE_STATE_FRAMEBUFFER];
	r600_context_pipe_state_emit(ctx, cb_state, RADEON_CP_PACKET3_COMPUTE_MODE);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (ctx->cs_shader->resources[i].enabled) {
			int j;
			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < ctx->cs_shader->resources[i].cs_end; j++) {
				if (ctx->cs_shader->resources[i].do_reloc[j]) {
					assert(ctx->cs_shader->resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						ctx->cs_shader->resources[i].bo,
						ctx->cs_shader->resources[i].usage);
				}

				cs->buf[cs->cdw++] = ctx->cs_shader->resources[i].cs[j];
			}

			if (ctx->cs_shader->resources[i].bo) {
				onebo = ctx->cs_shader->resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					ctx->cs_shader->resources[i].bo,
					ctx->cs_shader->resources[i].usage);

				///special case for textures
				if (ctx->cs_shader->resources[i].do_reloc
					[ctx->cs_shader->resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						ctx->cs_shader->resources[i].bo,
						ctx->cs_shader->resources[i].usage);
				}
			}
		}
	}

	/* r600_flush_framebuffer() updates the cb_flush_flags and then
	 * calls r600_emit_atom() on the ctx->surface_sync_cmd.atom, which emits
	 * a SURFACE_SYNC packet via r600_emit_surface_sync().
	 *
	 * XXX r600_emit_surface_sync() hardcodes the CP_COHER_SIZE to
	 * 0xffffffff, so we will need to add a field to struct
	 * r600_surface_sync_cmd if we want to manually set this value.
	 */
	r600_flush_framebuffer(ctx, true /* Flush now */);

#if 0
	COMPUTE_DBG("cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	COMPUTE_DBG("shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG("...\n");

	ctx->streamout_start = TRUE;
	ctx->streamout_append_bitmask = ~0;

}

static void evergreen_launch_grid(
	struct pipe_context *ctx_,
	const uint *block_layout, const uint *grid_layout,
	uint32_t pc, const void *input)
{
	COMPUTE_DBG("PC: %i\n", pc);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	unsigned num_waves;
	unsigned num_pipes = ctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
			num_pipes, num_waves);

	evergreen_set_lds(ctx->cs_shader, 0, 0, num_waves);
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
	compute_emit_cs(ctx);
}
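
/* Worked example of the wavefront calculation above (illustrative numbers):
 * a 16x16x1 thread block on a GPU reporting r600_max_pipes = 8 gives
 *
 *   num_waves = ceil((16 * 16 * 1) / (16 * 8)) = ceil(256 / 128) = 2
 *
 * so two wavefronts per thread block are passed to evergreen_set_lds().
 */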

static void evergreen_set_compute_resources(struct pipe_context *ctx_,
		unsigned start, unsigned count,
		struct pipe_surface **surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG("*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (int i = 0; i < count; i++) {
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_set_vtx_resource(ctx->cs_shader,
				(struct r600_resource *)resources[i]->base.texture, i+2,
				buffer->chunk->start_in_dw*4, resources[i]->base.writable);
		}
	}

}

static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (int i = 0; i < count; i++) {
		if (resource[i]) {
			assert(i+1 < 12);
			/* FETCH0 = VTX0 (param buffer),
			 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
			evergreen_set_tex_resource(ctx->cs_shader, resource[i], i+2);
		}
	}
}


static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state **samplers =
		(struct compute_sampler_state **)samplers_;

	for (int i = 0; i < num_samplers; i++) {
		if (samplers[i]) {
			evergreen_set_sampler_resource(ctx->cs_shader, samplers[i], i);
		}
	}
}

static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	COMPUTE_DBG("*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	compute_memory_finalize_pending(pool, ctx_);

	for (int i = 0; i < n; i++)
	{
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_set_vtx_resource(ctx->cs_shader, pool->bo, 1, 0, 1);
}
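
/* Illustrative only: every global buffer lives inside the single compute
 * memory pool, so the "handle" written back above is simply the buffer's
 * byte offset within that pool (start_in_dw * 4). A buffer whose chunk
 * starts at dword 256 therefore reports the handle 1024, and the kernel
 * reaches it through RAT0/VTX1, which are bound to the whole pool at the
 * end of evergreen_set_global_binding().
 */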

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in
 * the functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs,
 * depending on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* We aren't passing the EMIT_EARLY flag as the third argument
	 * because we will be emitting this atom manually in order to
	 * ensure it gets emitted after the start_cs_cmd atom.
	 */
	r600_init_command_buffer(cb, 256, 0);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	switch (ctx->family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (ctx->chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Context Registers */

	if (ctx->chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
				S_0286E8_TID_IN_GROUP_ENA |
				S_0286E8_TGID_ENA |
				S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code. We don't currently
	 * use this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which is
	 * the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}
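
/* For reference, 0x1000FFF is consistent with the values described in the
 * LOOP_CONST comment above: a maximum count of 0xfff in the low bits, an
 * initial value of 0, and an increment of 1 in the high byte. This assumes
 * the usual SQ_LOOP_CONST field layout; check evergreend.h / the hardware
 * docs before relying on the exact bit positions.
 */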

void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->context.create_compute_state = evergreen_create_compute_state;
	ctx->context.delete_compute_state = evergreen_delete_compute_state;
	ctx->context.bind_compute_state = evergreen_bind_compute_state;
//	ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->context.set_compute_resources = evergreen_set_compute_resources;
	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
	ctx->context.set_global_binding = evergreen_set_global_binding;
	ctx->context.launch_grid = evergreen_launch_grid;
}
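
/* Typical driving sequence from a state tracker, sketched here for
 * orientation only (the exact Gallium interface definitions live in
 * p_context.h, not in this file):
 *
 *   void *cs = pipe->create_compute_state(pipe, &state);
 *   pipe->bind_compute_state(pipe, cs);
 *   pipe->set_global_binding(pipe, 0, 1, &res, &handle);
 *   pipe->launch_grid(pipe, block_layout, grid_layout, pc, input);
 */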


struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	struct r600_resource_global* result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG("*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG("width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b.screen = screen;
	result->base.b.b = *templ;
	pipe_reference_init(&result->base.b.b.reference, 1);

	int size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
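
/* Sizing note (illustrative): the pool is managed in dwords, so the byte
 * width requested by the state tracker is rounded up, e.g. width0 = 10
 * gives size_in_dw = (10 + 3) / 4 = 3 dwords (12 bytes) in the pool.
 */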

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	struct r600_resource_global* buffer = (struct r600_resource_global*)res;
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void* r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	uint32_t* map;
	///TODO: do it better, mapping is not possible if the pool is too big

	if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
				ctx->cs, transfer->usage))) {
		return NULL;
	}

	COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}
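
/* Illustrative only: for a chunk starting at dword 256 and transfer->box.x =
 * 16, the returned pointer is 256 dwords (1024 bytes) plus 16 bytes past the
 * start of the mapped pool, i.e. byte offset 16 within this buffer.
 */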

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
}

struct pipe_transfer * r600_compute_global_get_transfer(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;
	transfer->data = NULL;

	/* Note that the strides are zero; this is OK for buffers, but not for
	 * 2D textures and higher.
	 */
	return transfer;
}

void r600_compute_global_transfer_destroy(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	util_slab_free(&rctx->pool_transfers, transfer);
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}