[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => at most 10 image bindings for writing.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82 writable images will consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
85
86 struct r600_resource* r600_compute_buffer_alloc_vram(
87 struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource * buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create(
94 (struct pipe_screen*) screen,
95 PIPE_BIND_CUSTOM,
96 PIPE_USAGE_IMMUTABLE,
97 size);
98
99 return (struct r600_resource *)buffer;
100 }
101
102
103 static void evergreen_set_rat(
104 struct r600_pipe_compute *pipe,
105 unsigned id,
106 struct r600_resource* bo,
107 int start,
108 int size)
109 {
110 struct pipe_surface rat_templ;
111 struct r600_surface *surf = NULL;
112 struct r600_context *rctx = NULL;
113
114 assert(id < 12);
115 assert((size & 3) == 0);
116 assert((start & 0xFF) == 0);
117
118 rctx = pipe->ctx;
119
120 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
121
122 /* Create the RAT surface */
123 memset(&rat_templ, 0, sizeof(rat_templ));
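	/* Note: the RAT surface is viewed as untyped R32_UINT data below, so
	 * each element addressed through it is a single dword. */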
124 rat_templ.format = PIPE_FORMAT_R32_UINT;
125 rat_templ.u.tex.level = 0;
126 rat_templ.u.tex.first_layer = 0;
127 rat_templ.u.tex.last_layer = 0;
128
129 	/* Add the RAT to the list of color buffers */
130 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131 (struct pipe_context *)pipe->ctx,
132 (struct pipe_resource *)bo, &rat_templ);
133
134 /* Update the number of color buffers */
135 pipe->ctx->framebuffer.state.nr_cbufs =
136 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138 /* Update the cb_target_mask
139 * XXX: I think this is a potential spot for bugs once we start doing
140 * GL interop. cb_target_mask may be modified in the 3D sections
141 * of this driver. */
142 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145 evergreen_init_color_surface_rat(rctx, surf);
146 }
147
148 static void evergreen_cs_set_vertex_buffer(
149 struct r600_context * rctx,
150 unsigned vb_index,
151 unsigned offset,
152 struct pipe_resource * buffer)
153 {
154 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156 vb->stride = 1;
157 vb->buffer_offset = offset;
158 vb->buffer = buffer;
159 vb->user_buffer = NULL;
160
161 /* The vertex instructions in the compute shaders use the texture cache,
162 * so we need to invalidate it. */
163 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164 state->enabled_mask |= 1 << vb_index;
165 state->dirty_mask |= 1 << vb_index;
166 r600_mark_atom_dirty(rctx, &state->atom);
167 }
168
169 static void evergreen_cs_set_constant_buffer(
170 struct r600_context * rctx,
171 unsigned cb_index,
172 unsigned offset,
173 unsigned size,
174 struct pipe_resource * buffer)
175 {
176 struct pipe_constant_buffer cb;
177 cb.buffer_size = size;
178 cb.buffer_offset = offset;
179 cb.buffer = buffer;
180 cb.user_buffer = NULL;
181
182 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183 }
184
185 static const struct u_resource_vtbl r600_global_buffer_vtbl =
186 {
187 u_default_resource_get_handle, /* get_handle */
188 r600_compute_global_buffer_destroy, /* resource_destroy */
189 r600_compute_global_transfer_map, /* transfer_map */
190 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191 r600_compute_global_transfer_unmap, /* transfer_unmap */
192 r600_compute_global_transfer_inline_write /* transfer_inline_write */
193 };
194
195
196 void *evergreen_create_compute_state(
197 struct pipe_context *ctx_,
198 		const struct pipe_compute_state *cso)
199 {
200 struct r600_context *ctx = (struct r600_context *)ctx_;
201 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const char *code;
205 void *p;
206 boolean use_kill;
207
208 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
211 #if HAVE_LLVM < 0x0306
212 (void)use_kill;
213 (void)p;
214 shader->llvm_ctx = LLVMContextCreate();
215 shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
216 code, header->num_bytes);
217 shader->kernels = CALLOC(sizeof(struct r600_kernel),
218 shader->num_kernels);
219 {
220 unsigned i;
221 for (i = 0; i < shader->num_kernels; i++) {
222 struct r600_kernel *kernel = &shader->kernels[i];
223 kernel->llvm_module = radeon_llvm_get_kernel_module(
224 shader->llvm_ctx, i, code, header->num_bytes);
225 }
226 }
227 #else
228 radeon_shader_binary_init(&shader->binary);
229 radeon_elf_read(code, header->num_bytes, &shader->binary);
230 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
231
232 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
233 shader->bc.ndw * 4);
234 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
235 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
236 ctx->b.ws->buffer_unmap(shader->code_bo->buf);
237 #endif
238 #endif
239
240 shader->ctx = ctx;
241 shader->local_size = cso->req_local_mem;
242 shader->private_size = cso->req_private_mem;
243 shader->input_size = cso->req_input_mem;
244
245 return shader;
246 }
247
248 void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
249 {
250 struct r600_context *ctx = (struct r600_context *)ctx_;
251 COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
252 struct r600_pipe_compute *shader = state;
253
254 if (!shader)
255 return;
256
257 #ifdef HAVE_OPENCL
258 #if HAVE_LLVM < 0x0306
259 for (unsigned i = 0; i < shader->num_kernels; i++) {
260 struct r600_kernel *kernel = &shader->kernels[i];
261 		LLVMDisposeModule(kernel->llvm_module);
262 }
263 FREE(shader->kernels);
264 LLVMContextDispose(shader->llvm_ctx);
265 #else
266 radeon_shader_binary_clean(&shader->binary);
267 r600_destroy_shader(&shader->bc);
268
269 /* TODO destroy shader->code_bo, shader->const_bo
270 * we'll need something like r600_buffer_free */
271 #endif
272 #endif
273 FREE(shader);
274 }
275
276 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
277 {
278 struct r600_context *ctx = (struct r600_context *)ctx_;
279
280 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
281
282 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
283 }
284
285 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
286 * explicit kernel parameters, there are implicit parameters that need to
287 * be stored in the vertex buffer as well. Here is how these parameters are
288 * organized in the buffer:
289 *
290 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
291 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
292 * DWORDS 6-8: Number of work items within each work group in each dimension
293 * (x,y,z)
294 * DWORDS 9+ : Kernel parameters
295 */
296 void evergreen_compute_upload_input(
297 struct pipe_context *ctx_,
298 const uint *block_layout,
299 const uint *grid_layout,
300 const void *input)
301 {
302 struct r600_context *ctx = (struct r600_context *)ctx_;
303 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
304 unsigned i;
305 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
306 * parameters.
307 */
308 unsigned input_size = shader->input_size + 36;
309 uint32_t * num_work_groups_start;
310 uint32_t * global_size_start;
311 uint32_t * local_size_start;
312 uint32_t * kernel_parameters_start;
313 struct pipe_box box;
314 struct pipe_transfer *transfer = NULL;
315
316 if (shader->input_size == 0) {
317 return;
318 }
319
320 if (!shader->kernel_param) {
321 /* Add space for the grid dimensions */
322 shader->kernel_param = (struct r600_resource *)
323 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
324 PIPE_USAGE_IMMUTABLE, input_size);
325 }
326
327 u_box_1d(0, input_size, &box);
328 num_work_groups_start = ctx_->transfer_map(ctx_,
329 (struct pipe_resource*)shader->kernel_param,
330 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
331 &box, &transfer);
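	/* Each implicit section (work group count, global size, local size) is
	 * 3 dwords, so every pointer below advances by
	 * 3 * sizeof(uint) / 4 = 3 uint32_t entries. */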
332 	global_size_start = num_work_groups_start + (3 * (sizeof(uint)) / 4);
333 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
334 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
335
336 	/* Copy the number of work groups in each dimension (grid size) */
337 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
338
339 /* Copy the global size */
340 for (i = 0; i < 3; i++) {
341 global_size_start[i] = grid_layout[i] * block_layout[i];
342 }
343
344 /* Copy the local dimensions */
345 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
346
347 /* Copy the kernel inputs */
348 memcpy(kernel_parameters_start, input, shader->input_size);
349
350 for (i = 0; i < (input_size / 4); i++) {
351 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
352 ((unsigned*)num_work_groups_start)[i]);
353 }
354
355 ctx_->transfer_unmap(ctx_, transfer);
356
357 /* ID=0 is reserved for the parameters */
358 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
359 (struct pipe_resource*)shader->kernel_param);
360 }
361
362 static void evergreen_emit_direct_dispatch(
363 struct r600_context *rctx,
364 const uint *block_layout, const uint *grid_layout)
365 {
366 int i;
367 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
368 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
369 unsigned num_waves;
370 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
371 unsigned wave_divisor = (16 * num_pipes);
372 int group_size = 1;
373 int grid_size = 1;
374 unsigned lds_size = shader->local_size / 4 +
375 #if HAVE_LLVM < 0x0306
376 shader->active_kernel->bc.nlds_dw;
377 #else
378 shader->bc.nlds_dw;
379 #endif
380
381
382 /* Calculate group_size/grid_size */
383 for (i = 0; i < 3; i++) {
384 group_size *= block_layout[i];
385 }
386
387 for (i = 0; i < 3; i++) {
388 grid_size *= grid_layout[i];
389 }
390
391 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
392 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
393 wave_divisor - 1) / wave_divisor;
394
395 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
396 "%u wavefronts per thread block, "
397 "allocating %u dwords lds.\n",
398 num_pipes, num_waves, lds_size);
399
400 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
401
402 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
403 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
404 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
405 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
406
407 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
408 group_size);
409
410 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
411 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
412 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
413 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
414
415 if (rctx->b.chip_class < CAYMAN) {
416 assert(lds_size <= 8192);
417 } else {
418 /* Cayman appears to have a slightly smaller limit, see the
419 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
420 assert(lds_size <= 8160);
421 }
422
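	/* SQ_LDS_ALLOC: the LDS size (in dwords) goes in the low bits and the
	 * number of wavefronts per thread group is packed in starting at
	 * bit 14, hence the lds_size | (num_waves << 14) value below. */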
423 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
424 lds_size | (num_waves << 14));
425
426 /* Dispatch packet */
427 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
428 radeon_emit(cs, grid_layout[0]);
429 radeon_emit(cs, grid_layout[1]);
430 radeon_emit(cs, grid_layout[2]);
431 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
432 radeon_emit(cs, 1);
433 }
434
435 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
436 const uint *grid_layout)
437 {
438 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
439 unsigned i;
440
441 	/* make sure that the gfx ring is the only one active */
442 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
443 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
444 }
445
446 /* Initialize all the compute-related registers.
447 *
448 * See evergreen_init_atom_start_compute_cs() in this file for the list
449 * of registers initialized by the start_compute_cs_cmd atom.
450 */
451 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
452
453 /* emit config state */
454 if (ctx->b.chip_class == EVERGREEN)
455 r600_emit_atom(ctx, &ctx->config_state.atom);
456
457 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
458 r600_flush_emit(ctx);
459
460 /* Emit colorbuffers. */
461 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
462 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
463 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
464 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
465 (struct r600_resource*)cb->base.texture,
466 RADEON_USAGE_READWRITE,
467 RADEON_PRIO_SHADER_RW_BUFFER);
468
469 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
470 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
471 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
472 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
473 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
474 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
475 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
476 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
477
478 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
479 radeon_emit(cs, reloc);
480
481 if (!ctx->keep_tiling_flags) {
482 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
483 radeon_emit(cs, reloc);
484 }
485
486 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
487 radeon_emit(cs, reloc);
488 }
489 if (ctx->keep_tiling_flags) {
490 for (; i < 8 ; i++) {
491 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
492 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
493 }
494 for (; i < 12; i++) {
495 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
496 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
497 }
498 }
499
500 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
501 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
502 ctx->compute_cb_target_mask);
503
504
505 /* Emit vertex buffer state */
506 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
507 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
508
509 /* Emit constant buffer state */
510 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
511
512 /* Emit sampler state */
513 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
514
515 /* Emit sampler view (texture resource) state */
516 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
517
518 /* Emit compute shader state */
519 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
520
521 /* Emit dispatch state and dispatch packet */
522 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
523
524 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
525 */
526 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
527 R600_CONTEXT_INV_VERTEX_CACHE |
528 R600_CONTEXT_INV_TEX_CACHE;
529 r600_flush_emit(ctx);
530 ctx->b.flags = 0;
531
532 if (ctx->b.chip_class >= CAYMAN) {
533 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
534 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
535 /* DEALLOC_STATE prevents the GPU from hanging when a
536 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
537 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
538 */
539 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
540 cs->buf[cs->cdw++] = 0;
541 }
542
543 #if 0
544 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
545 for (i = 0; i < cs->cdw; i++) {
546 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
547 }
548 #endif
549
550 }
551
552
553 /**
554 * Emit function for r600_cs_shader_state atom
555 */
556 void evergreen_emit_cs_shader(
557 struct r600_context *rctx,
558 struct r600_atom *atom)
559 {
560 struct r600_cs_shader_state *state =
561 (struct r600_cs_shader_state*)atom;
562 struct r600_pipe_compute *shader = state->shader;
563 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
564 uint64_t va;
565 struct r600_resource *code_bo;
566 unsigned ngpr, nstack;
567
568 #if HAVE_LLVM < 0x0306
569 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
570 code_bo = kernel->code_bo;
571 va = kernel->code_bo->gpu_address;
572 ngpr = kernel->bc.ngpr;
573 nstack = kernel->bc.nstack;
574 #else
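	/* With LLVM >= 3.6 there is a single code buffer for the whole program;
	 * state->pc is used as a byte offset selecting the kernel within it. */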
575 code_bo = shader->code_bo;
576 va = shader->code_bo->gpu_address + state->pc;
577 ngpr = shader->bc.ngpr;
578 nstack = shader->bc.nstack;
579 #endif
580
581 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
582 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
583 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
584 S_0288D4_NUM_GPRS(ngpr)
585 | S_0288D4_STACK_SIZE(nstack));
586 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
587
588 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
589 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
590 code_bo, RADEON_USAGE_READ,
591 RADEON_PRIO_USER_SHADER));
592 }
593
594 static void evergreen_launch_grid(
595 struct pipe_context *ctx_,
596 const uint *block_layout, const uint *grid_layout,
597 uint32_t pc, const void *input)
598 {
599 struct r600_context *ctx = (struct r600_context *)ctx_;
600 #ifdef HAVE_OPENCL
601 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
602 boolean use_kill;
603
604 #if HAVE_LLVM < 0x0306
605 struct r600_kernel *kernel = &shader->kernels[pc];
606 (void)use_kill;
607 if (!kernel->code_bo) {
608 void *p;
609 struct r600_bytecode *bc = &kernel->bc;
610 LLVMModuleRef mod = kernel->llvm_module;
611 boolean use_kill = false;
612 bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
613 unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
614 unsigned sb_disasm = use_sb ||
615 (ctx->screen->b.debug_flags & DBG_SB_DISASM);
616
617 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
618 ctx->screen->has_compressed_msaa_texturing);
619 bc->type = TGSI_PROCESSOR_COMPUTE;
620 bc->isa = ctx->isa;
621 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);
622
623 if (dump && !sb_disasm) {
624 r600_bytecode_disasm(bc);
625 } else if ((dump && sb_disasm) || use_sb) {
626 if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
627 R600_ERR("r600_sb_bytecode_process failed!\n");
628 }
629
630 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
631 kernel->bc.ndw * 4);
632 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
633 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
634 ctx->b.ws->buffer_unmap(kernel->code_bo->buf);
635 }
636 shader->active_kernel = kernel;
637 ctx->cs_shader_state.kernel_index = pc;
638 #else
639 ctx->cs_shader_state.pc = pc;
640 /* Get the config information for this kernel. */
641 r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
642 #endif
643 #endif
644
645 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
646
647
648 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
649 compute_emit_cs(ctx, block_layout, grid_layout);
650 }
651
652 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
653 unsigned start, unsigned count,
654 struct pipe_surface ** surfaces)
655 {
656 struct r600_context *ctx = (struct r600_context *)ctx_;
657 struct r600_surface **resources = (struct r600_surface **)surfaces;
658
659 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
660 start, count);
661
662 for (unsigned i = 0; i < count; i++) {
663 		/* The first two vertex buffers are reserved for parameters and
664 * global buffers. */
665 unsigned vtx_id = 2 + i;
666 if (resources[i]) {
667 struct r600_resource_global *buffer =
668 (struct r600_resource_global*)
669 resources[i]->base.texture;
670 if (resources[i]->base.writable) {
671 assert(i+1 < 12);
672
673 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
674 (struct r600_resource *)resources[i]->base.texture,
675 buffer->chunk->start_in_dw*4,
676 resources[i]->base.texture->width0);
677 }
678
679 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
680 buffer->chunk->start_in_dw * 4,
681 resources[i]->base.texture);
682 }
683 }
684 }
685
686 static void evergreen_set_global_binding(
687 struct pipe_context *ctx_, unsigned first, unsigned n,
688 struct pipe_resource **resources,
689 uint32_t **handles)
690 {
691 struct r600_context *ctx = (struct r600_context *)ctx_;
692 struct compute_memory_pool *pool = ctx->screen->global_pool;
693 struct r600_resource_global **buffers =
694 (struct r600_resource_global **)resources;
695 unsigned i;
696
697 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
698 first, n);
699
700 if (!resources) {
701 /* XXX: Unset */
702 return;
703 }
704
705 /* We mark these items for promotion to the pool if they
706 * aren't already there */
707 for (i = first; i < first + n; i++) {
708 struct compute_memory_item *item = buffers[i]->chunk;
709
710 if (!is_item_in_pool(item))
711 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
712 }
713
714 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
715 /* XXX: Unset */
716 return;
717 }
718
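	/* Each handle returned to the state tracker is the byte offset of the
	 * buffer within the global memory pool: the offset already stored in
	 * the handle plus the chunk's start offset in the pool (in bytes). */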
719 for (i = first; i < first + n; i++)
720 {
721 uint32_t buffer_offset;
722 uint32_t handle;
723 assert(resources[i]->target == PIPE_BUFFER);
724 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
725
726 buffer_offset = util_le32_to_cpu(*(handles[i]));
727 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
728
729 *(handles[i]) = util_cpu_to_le32(handle);
730 }
731
732 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
733 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
734 (struct pipe_resource*)pool->bo);
735 }
736
737 /**
738 * This function initializes all the compute specific registers that need to
739 * be initialized for each compute command stream. Registers that are common
740 * to both compute and 3D will be initialized at the beginning of each compute
741 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
742 * packet requires that the shader type bit be set, we must initialize all
743 * context registers needed for compute in this function. The registers
744 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
745 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
746 * on the GPU family.
747 */
748 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
749 {
750 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
751 int num_threads;
752 int num_stack_entries;
753
754 /* since all required registers are initialized in the
755 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
756 */
757 r600_init_command_buffer(cb, 256);
758 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
759
760 /* This must be first. */
761 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
762 r600_store_value(cb, 0x80000000);
763 r600_store_value(cb, 0x80000000);
764
765 /* We're setting config registers here. */
766 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
767 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
768
769 switch (ctx->b.family) {
770 case CHIP_CEDAR:
771 default:
772 num_threads = 128;
773 num_stack_entries = 256;
774 break;
775 case CHIP_REDWOOD:
776 num_threads = 128;
777 num_stack_entries = 256;
778 break;
779 case CHIP_JUNIPER:
780 num_threads = 128;
781 num_stack_entries = 512;
782 break;
783 case CHIP_CYPRESS:
784 case CHIP_HEMLOCK:
785 num_threads = 128;
786 num_stack_entries = 512;
787 break;
788 case CHIP_PALM:
789 num_threads = 128;
790 num_stack_entries = 256;
791 break;
792 case CHIP_SUMO:
793 num_threads = 128;
794 num_stack_entries = 256;
795 break;
796 case CHIP_SUMO2:
797 num_threads = 128;
798 num_stack_entries = 512;
799 break;
800 case CHIP_BARTS:
801 num_threads = 128;
802 num_stack_entries = 512;
803 break;
804 case CHIP_TURKS:
805 num_threads = 128;
806 num_stack_entries = 256;
807 break;
808 case CHIP_CAICOS:
809 num_threads = 128;
810 num_stack_entries = 256;
811 break;
812 }
813
814 /* Config Registers */
815 if (ctx->b.chip_class < CAYMAN)
816 evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
817 ctx->screen->b.info.drm_minor);
818 else
819 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
820 ctx->screen->b.info.drm_minor);
821
822 /* The primitive type always needs to be POINTLIST for compute. */
823 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
824 V_008958_DI_PT_POINTLIST);
825
826 if (ctx->b.chip_class < CAYMAN) {
827
828 /* These registers control which simds can be used by each stage.
829 * The default for these registers is 0xffffffff, which means
830 * all simds are available for each stage. It's possible we may
831 * want to play around with these in the future, but for now
832 * the default value is fine.
833 *
834 * R_008E20_SQ_STATIC_THREAD_MGMT1
835 * R_008E24_SQ_STATIC_THREAD_MGMT2
836 * R_008E28_SQ_STATIC_THREAD_MGMT3
837 */
838
839 /* XXX: We may need to adjust the thread and stack resource
840 * values for 3D/compute interop */
841
842 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
843
844 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
845 * Set the number of threads used by the PS/VS/GS/ES stage to
846 * 0.
847 */
848 r600_store_value(cb, 0);
849
850 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
851 * Set the number of threads used by the CS (aka LS) stage to
852 * the maximum number of threads and set the number of threads
853 * for the HS stage to 0. */
854 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
855
856 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
857 * Set the Control Flow stack entries to 0 for PS/VS stages */
858 r600_store_value(cb, 0);
859
860 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
861 * Set the Control Flow stack entries to 0 for GS/ES stages */
862 r600_store_value(cb, 0);
863
864 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
865 		 * Set the Control Flow stack entries to 0 for the HS stage, and
866 * set it to the maximum value for the CS (aka LS) stage. */
867 r600_store_value(cb,
868 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
869 }
870 /* Give the compute shader all the available LDS space.
871 * NOTE: This only sets the maximum number of dwords that a compute
872 * shader can allocate. When a shader is executed, we still need to
873 * allocate the appropriate amount of LDS dwords using the
874 * CM_R_0288E8_SQ_LDS_ALLOC register.
875 */
876 if (ctx->b.chip_class < CAYMAN) {
877 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
878 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
879 } else {
880 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
881 S_0286FC_NUM_PS_LDS(0) |
882 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
883 }
884
885 /* Context Registers */
886
887 if (ctx->b.chip_class < CAYMAN) {
888 /* workaround for hw issues with dyn gpr - must set all limits
889 * to 240 instead of 0, 0x1e == 240 / 8
890 */
891 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
892 S_028838_PS_GPRS(0x1e) |
893 S_028838_VS_GPRS(0x1e) |
894 S_028838_GS_GPRS(0x1e) |
895 S_028838_ES_GPRS(0x1e) |
896 S_028838_HS_GPRS(0x1e) |
897 S_028838_LS_GPRS(0x1e));
898 }
899
900 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
901 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
902 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
903
904 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
905
906 	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
907 		S_0286E8_TID_IN_GROUP_ENA
908 		| S_0286E8_TGID_ENA
909 		| S_0286E8_DISABLE_INDEX_PACK);
910
911
912 	/* The LOOP_CONST registers are an optimization for loops that allows
913 	 * you to store the initial counter, increment value, and maximum
914 	 * counter value in a register, so that the hardware can calculate the
915 	 * correct number of iterations for the loop and you don't need
916 	 * to keep the loop counter in your shader code. We don't currently use
917 	 * this optimization, so we must keep track of the counter in the
918 	 * shader and use a break instruction to exit loops. However, the
919 	 * hardware still uses this register to determine when to exit a
920 	 * loop, so we need to initialize the counter to 0, set the increment
921 	 * value to 1 and the maximum counter value to 4095 (0xfff), which
922 	 * is the maximum value allowed. This gives us a maximum of 4096
923 	 * iterations for our loops, but hopefully our break instruction will
924 	 * execute some time before the 4096th iteration.
925 	 */
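	/* 0x1000FFF below should decode to trip count = 0xfff, start = 0 and
	 * increment = 1, assuming the usual count/init/inc layout of the
	 * SQ_LOOP_CONST fields described above. */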
926 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
927 }
928
929 void evergreen_init_compute_state_functions(struct r600_context *ctx)
930 {
931 ctx->b.b.create_compute_state = evergreen_create_compute_state;
932 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
933 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
934 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
935 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
936 ctx->b.b.set_global_binding = evergreen_set_global_binding;
937 ctx->b.b.launch_grid = evergreen_launch_grid;
938
939 }
940
941 struct pipe_resource *r600_compute_global_buffer_create(
942 struct pipe_screen *screen,
943 const struct pipe_resource *templ)
944 {
945 struct r600_resource_global* result = NULL;
946 struct r600_screen* rscreen = NULL;
947 int size_in_dw = 0;
948
949 assert(templ->target == PIPE_BUFFER);
950 assert(templ->bind & PIPE_BIND_GLOBAL);
951 assert(templ->array_size == 1 || templ->array_size == 0);
952 assert(templ->depth0 == 1 || templ->depth0 == 0);
953 assert(templ->height0 == 1 || templ->height0 == 0);
954
955 result = (struct r600_resource_global*)
956 CALLOC(sizeof(struct r600_resource_global), 1);
957 rscreen = (struct r600_screen*)screen;
958
959 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
960 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
961 templ->array_size);
962
963 result->base.b.vtbl = &r600_global_buffer_vtbl;
964 	result->base.b.b = *templ;
965 	result->base.b.b.screen = screen;
966 pipe_reference_init(&result->base.b.b.reference, 1);
967
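	/* width0 is the buffer size in bytes; the compute memory pool allocates
	 * in dwords, hence the round-up division by 4. */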
968 size_in_dw = (templ->width0+3) / 4;
969
970 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
971
972 if (result->chunk == NULL)
973 {
974 free(result);
975 return NULL;
976 }
977
978 return &result->base.b.b;
979 }
980
981 void r600_compute_global_buffer_destroy(
982 struct pipe_screen *screen,
983 struct pipe_resource *res)
984 {
985 struct r600_resource_global* buffer = NULL;
986 struct r600_screen* rscreen = NULL;
987
988 assert(res->target == PIPE_BUFFER);
989 assert(res->bind & PIPE_BIND_GLOBAL);
990
991 buffer = (struct r600_resource_global*)res;
992 rscreen = (struct r600_screen*)screen;
993
994 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
995
996 buffer->chunk = NULL;
997 free(res);
998 }
999
1000 void *r600_compute_global_transfer_map(
1001 struct pipe_context *ctx_,
1002 struct pipe_resource *resource,
1003 unsigned level,
1004 unsigned usage,
1005 const struct pipe_box *box,
1006 struct pipe_transfer **ptransfer)
1007 {
1008 struct r600_context *rctx = (struct r600_context*)ctx_;
1009 struct compute_memory_pool *pool = rctx->screen->global_pool;
1010 struct r600_resource_global* buffer =
1011 (struct r600_resource_global*)resource;
1012
1013 struct compute_memory_item *item = buffer->chunk;
1014 struct pipe_resource *dst = NULL;
1015 unsigned offset = box->x;
1016
1017 if (is_item_in_pool(item)) {
1018 compute_memory_demote_item(pool, item, ctx_);
1019 }
1020 else {
1021 if (item->real_buffer == NULL) {
1022 item->real_buffer =
1023 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1024 }
1025 }
1026
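	/* In both cases the data now lives in item->real_buffer, which is the
	 * resource that actually gets mapped below. */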
1027 dst = (struct pipe_resource*)item->real_buffer;
1028
1029 if (usage & PIPE_TRANSFER_READ)
1030 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1031
1032 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1033 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1034 "width = %u, height = %u, depth = %u)\n", level, usage,
1035 box->x, box->y, box->z, box->width, box->height,
1036 box->depth);
1037 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1038 "%u (box.x)\n", item->id, box->x);
1039
1040
1041 assert(resource->target == PIPE_BUFFER);
1042 assert(resource->bind & PIPE_BIND_GLOBAL);
1043 assert(box->x >= 0);
1044 assert(box->y == 0);
1045 assert(box->z == 0);
1046
1047 	/* TODO: do it better, mapping is not possible if the pool is too big */
1048 return pipe_buffer_map_range(ctx_, dst,
1049 offset, box->width, usage, ptransfer);
1050 }
1051
1052 void r600_compute_global_transfer_unmap(
1053 struct pipe_context *ctx_,
1054 struct pipe_transfer* transfer)
1055 {
1056 	/* struct r600_resource_global are not real resources, they just map
1057 	 * to an offset within the compute memory pool. The function
1058 	 * r600_compute_global_transfer_map() maps the memory pool
1059 	 * resource rather than the struct r600_resource_global passed to
1060 	 * it as an argument, and then initializes ptransfer->resource with
1061 	 * the memory pool resource (via pipe_buffer_map_range).
1062 	 * When transfer_unmap is called, it uses the memory pool's
1063 	 * vtable, which calls r600_buffer_transfer_unmap() rather than
1064 	 * this function.
1065 	 */
1066 assert (!"This function should not be called");
1067 }
1068
1069 void r600_compute_global_transfer_flush_region(
1070 struct pipe_context *ctx_,
1071 struct pipe_transfer *transfer,
1072 const struct pipe_box *box)
1073 {
1074 assert(0 && "TODO");
1075 }
1076
1077 void r600_compute_global_transfer_inline_write(
1078 struct pipe_context *pipe,
1079 struct pipe_resource *resource,
1080 unsigned level,
1081 unsigned usage,
1082 const struct pipe_box *box,
1083 const void *data,
1084 unsigned stride,
1085 unsigned layer_stride)
1086 {
1087 assert(0 && "TODO");
1088 }