r600g: Atomize compute shader state
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/u_double_list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "r600.h"
42 #include "evergreend.h"
43 #include "r600_resource.h"
44 #include "r600_shader.h"
45 #include "r600_pipe.h"
46 #include "r600_formats.h"
47 #include "evergreen_compute.h"
48 #include "r600_hw_context_priv.h"
49 #include "evergreen_compute_internal.h"
50 #include "compute_memory_pool.h"
51 #ifdef HAVE_OPENCL
52 #include "llvm_wrapper.h"
53 #endif
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant indexing;
67 it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing,
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => at most 10 image bindings for writing.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough; 176 is the max for reading according to the docs.
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1);
82 writable images also consume TEX slots, and VTX slots too because of linear indexing
83
84 */
85
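/*
 * A minimal illustrative sketch of the slot numbering described above. The
 * helper names are hypothetical (the driver computes these offsets inline
 * where evergreen_set_rat() and evergreen_set_tex_resource() are called
 * below); they only make the RAT0/VTX0/VTX1 and FETCH0/FETCH1 reservations
 * explicit.
 */
#if 0 /* illustration only, not compiled */
static inline unsigned writable_image_rat_id(unsigned image_index)
{
	/* RAT0 is reserved for the global buffer, so image i uses RAT(i+1). */
	return image_index + 1;
}

static inline unsigned image_fetch_resource_id(unsigned image_index)
{
	/* FETCH0 = VTX0 (params), FETCH1 = VTX1 (global pool), so TEX starts at 2. */
	return image_index + 2;
}
#endif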
86 static void evergreen_cs_set_vertex_buffer(
87 struct r600_context * rctx,
88 unsigned vb_index,
89 unsigned offset,
90 struct pipe_resource * buffer)
91 {
92 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
93 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
94 vb->stride = 1;
95 vb->buffer_offset = offset;
96 vb->buffer = buffer;
97 vb->user_buffer = NULL;
98
99 r600_inval_vertex_cache(rctx);
100 state->enabled_mask |= 1 << vb_index;
101 state->dirty_mask |= 1 << vb_index;
102 r600_atom_dirty(rctx, &state->atom);
103 }
104
105 const struct u_resource_vtbl r600_global_buffer_vtbl =
106 {
107 u_default_resource_get_handle, /* get_handle */
108 r600_compute_global_buffer_destroy, /* resource_destroy */
109 r600_compute_global_get_transfer, /* get_transfer */
110 r600_compute_global_transfer_destroy, /* transfer_destroy */
111 r600_compute_global_transfer_map, /* transfer_map */
112 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
113 r600_compute_global_transfer_unmap, /* transfer_unmap */
114 r600_compute_global_transfer_inline_write /* transfer_inline_write */
115 };
116
117
118 void *evergreen_create_compute_state(
119 struct pipe_context *ctx_,
120 const struct pipe_compute_state *cso)
121 {
122 struct r600_context *ctx = (struct r600_context *)ctx_;
123 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
124 void *p;
125
126 #ifdef HAVE_OPENCL
127 const struct pipe_llvm_program_header * header;
128 const unsigned char * code;
129
130 COMPUTE_DBG("*** evergreen_create_compute_state\n");
131
132 header = cso->prog;
133 code = cso->prog + sizeof(struct pipe_llvm_program_header);
134 #endif
135
136 shader->ctx = (struct r600_context*)ctx;
137 shader->resources = (struct evergreen_compute_resource*)
138 CALLOC(sizeof(struct evergreen_compute_resource),
139 get_compute_resource_num());
140 shader->local_size = cso->req_local_mem; ///TODO: assert it
141 shader->private_size = cso->req_private_mem;
142 shader->input_size = cso->req_input_mem;
143
144 #ifdef HAVE_OPENCL
145 shader->mod = llvm_parse_bitcode(code, header->num_bytes);
146
147 r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
148 #endif
149 shader->shader_code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
150 shader->bc.ndw * 4);
151
152 p = ctx->ws->buffer_map(shader->shader_code_bo->cs_buf, ctx->cs,
153 PIPE_TRANSFER_WRITE);
154
155 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
156 ctx->ws->buffer_unmap(shader->shader_code_bo->cs_buf);
157 return shader;
158 }
159
160 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
161 {
162 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
163
164 free(shader->resources);
165 free(shader);
166 }
167
168 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
169 {
170 struct r600_context *ctx = (struct r600_context *)ctx_;
171
172 COMPUTE_DBG("*** evergreen_bind_compute_state\n");
173
174 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
175 }
176
177 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
178 * kernel parameters, there are implicit parameters that need to be stored
179 * in the vertex buffer as well. Here is how these parameters are organized in
180 * the buffer:
181 *
182 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
183 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
184 * DWORDS 6-8: Number of work items within each work group in each dimension
185 * (x,y,z)
186 * DWORDS 9+ : Kernel parameters
187 */
188 void evergreen_compute_upload_input(
189 struct pipe_context *ctx_,
190 const uint *block_layout,
191 const uint *grid_layout,
192 const void *input)
193 {
194 struct r600_context *ctx = (struct r600_context *)ctx_;
195 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
196 int i;
197 unsigned kernel_parameters_offset_bytes = 36;
198 uint32_t * num_work_groups_start;
199 uint32_t * global_size_start;
200 uint32_t * local_size_start;
201 uint32_t * kernel_parameters_start;
202
203 if (shader->input_size == 0) {
204 return;
205 }
206
207 if (!shader->kernel_param) {
208 unsigned buffer_size = shader->input_size;
209
210 /* Add space for the grid dimensions */
211 buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
212 shader->kernel_param = r600_compute_buffer_alloc_vram(
213 ctx->screen, buffer_size);
214 }
215
216 num_work_groups_start = ctx->ws->buffer_map(
217 shader->kernel_param->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
218 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
219 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
220 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
221
222 /* Copy the number of work groups */
223 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
224
225 /* Copy the global size */
226 for (i = 0; i < 3; i++) {
227 global_size_start[i] = grid_layout[i] * block_layout[i];
228 }
229
230 /* Copy the local dimensions */
231 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
232
233 /* Copy the kernel inputs */
234 memcpy(kernel_parameters_start, input, shader->input_size);
235
236 for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
237 (shader->input_size / 4); i++) {
238 COMPUTE_DBG("input %i : %i\n", i,
239 ((unsigned*)num_work_groups_start)[i]);
240 }
241
242 ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);
243
244 ///ID=0 is reserved for the parameters
245 evergreen_cs_set_vertex_buffer(ctx, 0, 0,
246 (struct pipe_resource*)shader->kernel_param);
247 ///ID=0 is reserved for parameters
248 evergreen_set_const_cache(shader, 0, shader->kernel_param,
249 shader->input_size, 0);
250 }
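/* A worked example of the layout built above (numbers assumed for
 * illustration): for block_layout = {16, 16, 1} and grid_layout = {4, 2, 1},
 * the vertex buffer at ID=0 ends up containing:
 *
 *   DWORDs 0-2 : 4, 2, 1    (number of work groups per dimension)
 *   DWORDs 3-5 : 64, 32, 1  (global size = grid * block per dimension)
 *   DWORDs 6-8 : 16, 16, 1  (work items per group per dimension)
 *   DWORDs 9+  : the caller's kernel arguments, copied verbatim
 */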
251
252 void evergreen_direct_dispatch(
253 struct pipe_context *ctx_,
254 const uint *block_layout, const uint *grid_layout)
255 {
256 /* This struct r600_context* must be called rctx, because the
257 * r600_pipe_state_add_reg macro assumes there is a local variable
258 * of type struct r600_context* called rctx.
259 */
260 struct r600_context *rctx = (struct r600_context *)ctx_;
261 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
262
263 int i;
264
265 struct evergreen_compute_resource* res = get_empty_res(shader,
266 COMPUTE_RESOURCE_DISPATCH, 0);
267
268 /* Set CB_TARGET_MASK */
269 evergreen_reg_set(res, R_028238_CB_TARGET_MASK, rctx->compute_cb_target_mask);
270
271 evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST);
272
273 evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0);
274 evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0);
275 evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0);
276
277 evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]);
278 evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]);
279 evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]);
280
281 int group_size = 1;
282
283 int grid_size = 1;
284
285 for (i = 0; i < 3; i++) {
286 group_size *= block_layout[i];
287 }
288
289 for (i = 0; i < 3; i++) {
290 grid_size *= grid_layout[i];
291 }
292
293 evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size);
294 evergreen_reg_set(res, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size);
295
296 evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
297 evergreen_emit_raw_value(res, grid_layout[0]);
298 evergreen_emit_raw_value(res, grid_layout[1]);
299 evergreen_emit_raw_value(res, grid_layout[2]);
300 ///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN
301 evergreen_emit_raw_value(res, 1);
302 }
303
304 static void compute_emit_cs(struct r600_context *ctx)
305 {
306 struct radeon_winsys_cs *cs = ctx->cs;
307 int i;
308
309 struct r600_resource *onebo = NULL;
310 struct r600_pipe_state *cb_state;
311 struct evergreen_compute_resource *resources =
312 ctx->cs_shader_state.shader->resources;
313
314 /* Initialize all the registers common to both 3D and compute. Some
315 * 3D-only registers will be initialized by this atom as well, but
316 * this is OK for now.
317 *
318 * See evergreen_init_atom_start_cs() or cayman_init_atom_start_cs() in
319 evergreen_state.c for the list of registers that are initialized by
320 * the start_cs_cmd atom.
321 */
322 r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);
323
324 /* Initialize all the compute specific registers.
325 *
326 * See evergreen_init_atom_start_compute_cs() in this file for the list
327 * of registers initialized by the start_compute_cs_cmd atom.
328 */
329 r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom);
330
331 /* Emit cb_state */
332 cb_state = ctx->states[R600_PIPE_STATE_FRAMEBUFFER];
333 r600_context_pipe_state_emit(ctx, cb_state, RADEON_CP_PACKET3_COMPUTE_MODE);
334
335 /* Emit vertex buffer state */
336 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
337 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
338
339 /* Emit compute shader state */
340 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
341
342 for (i = 0; i < get_compute_resource_num(); i++) {
343 if (resources[i].enabled) {
344 int j;
345 COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);
346
347 for (j = 0; j < resources[i].cs_end; j++) {
348 if (resources[i].do_reloc[j]) {
349 assert(resources[i].bo);
350 evergreen_emit_ctx_reloc(ctx,
351 resources[i].bo,
352 resources[i].usage);
353 }
354
355 cs->buf[cs->cdw++] = resources[i].cs[j];
356 }
357
358 if (resources[i].bo) {
359 onebo = resources[i].bo;
360 evergreen_emit_ctx_reloc(ctx,
361 resources[i].bo,
362 resources[i].usage);
363
364 ///special case for textures
365 if (resources[i].do_reloc
366 [resources[i].cs_end] == 2) {
367 evergreen_emit_ctx_reloc(ctx,
368 resources[i].bo,
369 resources[i].usage);
370 }
371 }
372 }
373 }
374
375 /* r600_flush_framebuffer() updates the cb_flush_flags and then
376 * calls r600_emit_atom() on the ctx->surface_sync_cmd.atom, which emits
377 * a SURFACE_SYNC packet via r600_emit_surface_sync().
378 *
379 * XXX r600_emit_surface_sync() hardcodes the CP_COHER_SIZE to
380 * 0xffffffff, so we will need to add a field to struct
381 * r600_surface_sync_cmd if we want to manually set this value.
382 */
383 r600_flush_framebuffer(ctx, true /* Flush now */);
384
385 #if 0
386 COMPUTE_DBG("cdw: %i\n", cs->cdw);
387 for (i = 0; i < cs->cdw; i++) {
388 COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
389 }
390 #endif
391
392 ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);
393
394 ctx->pm4_dirty_cdwords = 0;
395 ctx->flags = 0;
396
397 COMPUTE_DBG("shader started\n");
398
399 ctx->ws->buffer_wait(onebo->buf, 0);
400
401 COMPUTE_DBG("...\n");
402
403 ctx->streamout_start = TRUE;
404 ctx->streamout_append_bitmask = ~0;
405
406 }
407
408
409 /**
410 * Emit function for r600_cs_shader_state atom
411 */
412 void evergreen_emit_cs_shader(
413 struct r600_context *rctx,
414 struct r600_atom *atom)
415 {
416 struct r600_cs_shader_state *state =
417 (struct r600_cs_shader_state*)atom;
418 struct r600_pipe_compute *shader = state->shader;
419 struct radeon_winsys_cs *cs = rctx->cs;
420 uint64_t va;
421
422 va = r600_resource_va(&rctx->screen->screen, &shader->shader_code_bo->b.b);
423
424 r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
425 r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
426 r600_write_value(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
427 S_0288D4_NUM_GPRS(shader->bc.ngpr)
428 | S_0288D4_STACK_SIZE(shader->bc.nstack));
429 r600_write_value(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
430
431 r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
432 r600_write_value(cs, r600_context_bo_reloc(rctx, shader->shader_code_bo,
433 RADEON_USAGE_READ));
434
435 r600_inval_shader_cache(rctx);
436 }
437
438 static void evergreen_launch_grid(
439 struct pipe_context *ctx_,
440 const uint *block_layout, const uint *grid_layout,
441 uint32_t pc, const void *input)
442 {
443 COMPUTE_DBG("PC: %i\n", pc);
444
445 struct r600_context *ctx = (struct r600_context *)ctx_;
446 unsigned num_waves;
447 unsigned num_pipes = ctx->screen->info.r600_max_pipes;
448 unsigned wave_divisor = (16 * num_pipes);
449
450 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
451 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
452 wave_divisor - 1) / wave_divisor;
453
454 COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
455 num_pipes, num_waves);
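/* Worked example with assumed numbers: a 16x16x1 thread block on a chip
 * reporting r600_max_pipes = 8 gives wave_divisor = 16 * 8 = 128, so
 * num_waves = ceil(256 / 128) = 2 wavefronts per thread block. */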
456
457 evergreen_set_lds(ctx->cs_shader_state.shader, 0, 0, num_waves);
458 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
459 evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
460 compute_emit_cs(ctx);
461 }
462
463 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
464 unsigned start, unsigned count,
465 struct pipe_surface ** surfaces)
466 {
467 struct r600_context *ctx = (struct r600_context *)ctx_;
468 struct r600_surface **resources = (struct r600_surface **)surfaces;
469
470 COMPUTE_DBG("*** evergreen_set_compute_resources: start = %u count = %u\n",
471 start, count);
472
473 for (int i = 0; i < count; i++) {
474 /* The first two vertex buffers are reserved for parameters and
475 * global buffers. */
476 unsigned vtx_id = 2 + i;
477 if (resources[i]) {
478 struct r600_resource_global *buffer =
479 (struct r600_resource_global*)
480 resources[i]->base.texture;
481 if (resources[i]->base.writable) {
482 assert(i+1 < 12);
483
484 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
485 (struct r600_resource *)resources[i]->base.texture,
486 buffer->chunk->start_in_dw*4,
487 resources[i]->base.texture->width0);
488 }
489
490 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
491 buffer->chunk->start_in_dw * 4,
492 resources[i]->base.texture);
493 }
494 }
495 }
496
497 static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
498 unsigned start_slot, unsigned count,
499 struct pipe_sampler_view **views)
500 {
501 struct r600_context *ctx = (struct r600_context *)ctx_;
502 struct r600_pipe_sampler_view **resource =
503 (struct r600_pipe_sampler_view **)views;
504
505 for (int i = 0; i < count; i++) {
506 if (resource[i]) {
507 assert(i+1 < 12);
508 ///FETCH0 = VTX0 (param buffer),
509 ///FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
510 evergreen_set_tex_resource(ctx->cs_shader_state.shader, resource[i], i+2);
511 }
512 }
513 }
514
515 static void evergreen_bind_compute_sampler_states(
516 struct pipe_context *ctx_,
517 unsigned start_slot,
518 unsigned num_samplers,
519 void **samplers_)
520 {
521 struct r600_context *ctx = (struct r600_context *)ctx_;
522 struct compute_sampler_state ** samplers =
523 (struct compute_sampler_state **)samplers_;
524
525 for (int i = 0; i < num_samplers; i++) {
526 if (samplers[i]) {
527 evergreen_set_sampler_resource(
528 ctx->cs_shader_state.shader, samplers[i], i);
529 }
530 }
531 }
532
533 static void evergreen_set_global_binding(
534 struct pipe_context *ctx_, unsigned first, unsigned n,
535 struct pipe_resource **resources,
536 uint32_t **handles)
537 {
538 struct r600_context *ctx = (struct r600_context *)ctx_;
539 struct compute_memory_pool *pool = ctx->screen->global_pool;
540 struct r600_resource_global **buffers =
541 (struct r600_resource_global **)resources;
542
543 COMPUTE_DBG("*** evergreen_set_global_binding first = %u n = %u\n",
544 first, n);
545
546 if (!resources) {
547 /* XXX: Unset */
548 return;
549 }
550
551 compute_memory_finalize_pending(pool, ctx_);
552
553 for (int i = 0; i < n; i++)
554 {
555 assert(resources[i]->target == PIPE_BUFFER);
556 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
557
558 *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
559 }
560
561 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
562 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
563 (struct pipe_resource*)pool->bo);
564 }
565
566 /**
567 * This function initializes all the compute specific registers that need to
568 * be initialized for each compute command stream. Registers that are common
569 * to both compute and 3D will be initialized at the beginning of each compute
570 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
571 * packet requires that the shader type bit be set, we must initialize all
572 * context registers needed for compute in this function. The registers
573 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
574 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
575 * on the GPU family.
576 */
577 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
578 {
579 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
580 int num_threads;
581 int num_stack_entries;
582
583 /* We aren't passing the EMIT_EARLY flag as the third argument
584 * because we will be emitting this atom manually in order to
585 * ensure it gets emitted after the start_cs_cmd atom.
586 */
587 r600_init_command_buffer(cb, 256, 0);
588 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
589
590 switch (ctx->family) {
591 case CHIP_CEDAR:
592 default:
593 num_threads = 128;
594 num_stack_entries = 256;
595 break;
596 case CHIP_REDWOOD:
597 num_threads = 128;
598 num_stack_entries = 256;
599 break;
600 case CHIP_JUNIPER:
601 num_threads = 128;
602 num_stack_entries = 512;
603 break;
604 case CHIP_CYPRESS:
605 case CHIP_HEMLOCK:
606 num_threads = 128;
607 num_stack_entries = 512;
608 break;
609 case CHIP_PALM:
610 num_threads = 128;
611 num_stack_entries = 256;
612 break;
613 case CHIP_SUMO:
614 num_threads = 128;
615 num_stack_entries = 256;
616 break;
617 case CHIP_SUMO2:
618 num_threads = 128;
619 num_stack_entries = 512;
620 break;
621 case CHIP_BARTS:
622 num_threads = 128;
623 num_stack_entries = 512;
624 break;
625 case CHIP_TURKS:
626 num_threads = 128;
627 num_stack_entries = 256;
628 break;
629 case CHIP_CAICOS:
630 num_threads = 128;
631 num_stack_entries = 256;
632 break;
633 }
634
635 /* Config Registers */
636 if (ctx->chip_class < CAYMAN) {
637
638 /* These registers control which simds can be used by each stage.
639 * The default for these registers is 0xffffffff, which means
640 * all simds are available for each stage. It's possible we may
641 * want to play around with these in the future, but for now
642 * the default value is fine.
643 *
644 * R_008E20_SQ_STATIC_THREAD_MGMT1
645 * R_008E24_SQ_STATIC_THREAD_MGMT2
646 * R_008E28_SQ_STATIC_THREAD_MGMT3
647 */
648
649 /* XXX: We may need to adjust the thread and stack resource
650 * values for 3D/compute interop */
651
652 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
653
654 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
655 * Set the number of threads used by the PS/VS/GS/ES stage to
656 * 0.
657 */
658 r600_store_value(cb, 0);
659
660 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
661 * Set the number of threads used by the CS (aka LS) stage to
662 * the maximum number of threads and set the number of threads
663 * for the HS stage to 0. */
664 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
665
666 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
667 * Set the Control Flow stack entries to 0 for PS/VS stages */
668 r600_store_value(cb, 0);
669
670 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
671 * Set the Control Flow stack entries to 0 for GS/ES stages */
672 r600_store_value(cb, 0);
673
674 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
675 * Set the Control Flow stack entries to 0 for the HS stage, and
676 * set it to the maximum value for the CS (aka LS) stage. */
677 r600_store_value(cb,
678 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
679 }
680
681 /* Context Registers */
682
683 if (ctx->chip_class < CAYMAN) {
684 /* workaround for hw issues with dyn gpr - must set all limits
685 * to 240 instead of 0, 0x1e == 240 / 8
686 */
687 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
688 S_028838_PS_GPRS(0x1e) |
689 S_028838_VS_GPRS(0x1e) |
690 S_028838_GS_GPRS(0x1e) |
691 S_028838_ES_GPRS(0x1e) |
692 S_028838_HS_GPRS(0x1e) |
693 S_028838_LS_GPRS(0x1e));
694 }
695
696 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
697 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
698 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
699
700 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
701
702 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
703 S_0286E8_TID_IN_GROUP_ENA
704 | S_0286E8_TGID_ENA
705 | S_0286E8_DISABLE_INDEX_PACK);
706
707
708 /* The LOOP_CONST registers are an optimization for loops that allows
709 * you to store the initial counter, increment value, and maximum
710 * counter value in a register so that hardware can calculate the
711 * correct number of iterations for the loop, so that you don't need
712 * to have the loop counter in your shader code. We don't currently use
713 * this optimization, so we must keep track of the counter in the
714 * shader and use a break instruction to exit loops. However, the
715 * hardware will still use this register to determine when to exit a
716 * loop, so we need to initialize the counter to 0, set the increment
717 * value to 1 and the maximum counter value to 4095 (0xfff), which
718 * is the maximum value allowed. This gives us a maximum of 4096
719 * iterations for our loops, but hopefully our break instruction will
720 * execute some time before the 4096th iteration.
721 */
722 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
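/* Sketch of how the 0x1000FFF value stored above appears to decode, assuming
 * the usual SQ_LOOP_CONST packing of count[11:0] / init[23:12] / increment[31:24]:
 *   0x01000FFF = (1 << 24) | (0 << 12) | 0xFFF
 *             -> increment 1, initial value 0, trip count 4095
 */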
723 }
724
725 void evergreen_init_compute_state_functions(struct r600_context *ctx)
726 {
727 ctx->context.create_compute_state = evergreen_create_compute_state;
728 ctx->context.delete_compute_state = evergreen_delete_compute_state;
729 ctx->context.bind_compute_state = evergreen_bind_compute_state;
730 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
731 ctx->context.set_compute_resources = evergreen_set_compute_resources;
732 ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
733 ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
734 ctx->context.set_global_binding = evergreen_set_global_binding;
735 ctx->context.launch_grid = evergreen_launch_grid;
736
737 /* We always use at least two vertex buffers for compute, one for
738 * parameters and one for global memory */
739 ctx->cs_vertex_buffer_state.enabled_mask =
740 ctx->cs_vertex_buffer_state.dirty_mask = 1 | 2;
741 }
742
743
744 struct pipe_resource *r600_compute_global_buffer_create(
745 struct pipe_screen *screen,
746 const struct pipe_resource *templ)
747 {
748 assert(templ->target == PIPE_BUFFER);
749 assert(templ->bind & PIPE_BIND_GLOBAL);
750 assert(templ->array_size == 1 || templ->array_size == 0);
751 assert(templ->depth0 == 1 || templ->depth0 == 0);
752 assert(templ->height0 == 1 || templ->height0 == 0);
753
754 struct r600_resource_global* result = (struct r600_resource_global*)
755 CALLOC(sizeof(struct r600_resource_global), 1);
756 struct r600_screen* rscreen = (struct r600_screen*)screen;
757
758 COMPUTE_DBG("*** r600_compute_global_buffer_create\n");
759 COMPUTE_DBG("width = %u array_size = %u\n", templ->width0,
760 templ->array_size);
761
762 result->base.b.vtbl = &r600_global_buffer_vtbl;
763 result->base.b.b = *templ;
764 result->base.b.b.screen = screen;
765 pipe_reference_init(&result->base.b.b.reference, 1);
766
767 int size_in_dw = (templ->width0+3) / 4;
768
769 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
770
771 if (result->chunk == NULL)
772 {
773 free(result);
774 return NULL;
775 }
776
777 return &result->base.b.b;
778 }
779
780 void r600_compute_global_buffer_destroy(
781 struct pipe_screen *screen,
782 struct pipe_resource *res)
783 {
784 assert(res->target == PIPE_BUFFER);
785 assert(res->bind & PIPE_BIND_GLOBAL);
786
787 struct r600_resource_global* buffer = (struct r600_resource_global*)res;
788 struct r600_screen* rscreen = (struct r600_screen*)screen;
789
790 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
791
792 buffer->chunk = NULL;
793 free(res);
794 }
795
796 void* r600_compute_global_transfer_map(
797 struct pipe_context *ctx_,
798 struct pipe_transfer* transfer)
799 {
800 assert(transfer->resource->target == PIPE_BUFFER);
801 assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
802 assert(transfer->box.x >= 0);
803 assert(transfer->box.y == 0);
804 assert(transfer->box.z == 0);
805
806 struct r600_context *ctx = (struct r600_context *)ctx_;
807 struct r600_resource_global* buffer =
808 (struct r600_resource_global*)transfer->resource;
809
810 uint32_t* map;
811 ///TODO: do it better, mapping is not possible if the pool is too big
812
813 if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
814 ctx->cs, transfer->usage))) {
815 return NULL;
816 }
817
818 COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
819 return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
820 }
821
822 void r600_compute_global_transfer_unmap(
823 struct pipe_context *ctx_,
824 struct pipe_transfer* transfer)
825 {
826 assert(transfer->resource->target == PIPE_BUFFER);
827 assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
828
829 struct r600_context *ctx = (struct r600_context *)ctx_;
830 struct r600_resource_global* buffer =
831 (struct r600_resource_global*)transfer->resource;
832
833 ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
834 }
835
836 struct pipe_transfer * r600_compute_global_get_transfer(
837 struct pipe_context *ctx_,
838 struct pipe_resource *resource,
839 unsigned level,
840 unsigned usage,
841 const struct pipe_box *box)
842 {
843 struct r600_context *ctx = (struct r600_context *)ctx_;
844 struct compute_memory_pool *pool = ctx->screen->global_pool;
845
846 compute_memory_finalize_pending(pool, ctx_);
847
848 assert(resource->target == PIPE_BUFFER);
849 struct r600_context *rctx = (struct r600_context*)ctx_;
850 struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
851
852 transfer->resource = resource;
853 transfer->level = level;
854 transfer->usage = usage;
855 transfer->box = *box;
856 transfer->stride = 0;
857 transfer->layer_stride = 0;
858 transfer->data = NULL;
859
860 /* Note: strides are zero; this is OK for buffers, but not for
861 * 2D and higher textures.
862 */
863 return transfer;
864 }
865
866 void r600_compute_global_transfer_destroy(
867 struct pipe_context *ctx_,
868 struct pipe_transfer *transfer)
869 {
870 struct r600_context *rctx = (struct r600_context*)ctx_;
871 util_slab_free(&rctx->pool_transfers, transfer);
872 }
873
874 void r600_compute_global_transfer_flush_region(
875 struct pipe_context *ctx_,
876 struct pipe_transfer *transfer,
877 const struct pipe_box *box)
878 {
879 assert(0 && "TODO");
880 }
881
882 void r600_compute_global_transfer_inline_write(
883 struct pipe_context *pipe,
884 struct pipe_resource *resource,
885 unsigned level,
886 unsigned usage,
887 const struct pipe_box *box,
888 const void *data,
889 unsigned stride,
890 unsigned layer_stride)
891 {
892 assert(0 && "TODO");
893 }