r600g/compute: Enable PIPE_SHADER_IR_NATIVE for compute shaders v2
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/u_double_list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon_llvm_util.h"
51 #endif
52 #include "radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56     RAT0 is used for global binding writes
57     VTX1 is used for global binding reads
58
59     for writing images: RAT1...
60     for reading images: TEX2...
61     TEX2 and RAT1 are paired
62
63     TEX2... consumes the same fetch resources that VTX2... would consume
64
65     CONST0 and VTX0 are used for kernel parameters
66     CONST0 binds the smaller input parameter buffer and is used for constant
67     indexing; it is also constant cached
68     VTX0 is used for indirect/non-constant indexing, or if the input is bigger
69     than the constant cache can handle
70
71     RATs are limited to 12, so we can bind at most 11 textures for writing
72     because we reserve RAT0 for global bindings. With byte addressing enabled,
73     we should reserve another one too => 10 image bindings for writing max.
74
75     from Nvidia OpenCL:
76     CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
77     CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
78
79     so 10 for writing is enough. 176 is the max for reading according to the docs
80
81     writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82     writable images also consume TEX slots, and VTX slots because of linear indexing
83
84 */
85
86 struct r600_resource* r600_compute_buffer_alloc_vram(
87 struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource * buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create(
94 (struct pipe_screen*) screen,
95 PIPE_BIND_CUSTOM,
96 PIPE_USAGE_IMMUTABLE,
97 size);
98
99 return (struct r600_resource *)buffer;
100 }
101
102
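/* Bind a buffer object as RAT (Random Access Target) number 'id' for write
 * access.  The buffer is wrapped in an R32_UINT surface and added to the
 * framebuffer's color buffer list, following the binding scheme described at
 * the top of this file (RAT0 = global buffer, RAT1+ = writable images). */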
103 static void evergreen_set_rat(
104 struct r600_pipe_compute *pipe,
105 unsigned id,
106 struct r600_resource* bo,
107 int start,
108 int size)
109 {
110 struct pipe_surface rat_templ;
111 struct r600_surface *surf = NULL;
112 struct r600_context *rctx = NULL;
113
114 assert(id < 12);
115 assert((size & 3) == 0);
116 assert((start & 0xFF) == 0);
117
118 rctx = pipe->ctx;
119
120 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
121
122 /* Create the RAT surface */
123 memset(&rat_templ, 0, sizeof(rat_templ));
124 rat_templ.format = PIPE_FORMAT_R32_UINT;
125 rat_templ.u.tex.level = 0;
126 rat_templ.u.tex.first_layer = 0;
127 rat_templ.u.tex.last_layer = 0;
128
129 	/* Add the RAT to the list of color buffers */
130 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131 (struct pipe_context *)pipe->ctx,
132 (struct pipe_resource *)bo, &rat_templ);
133
134 /* Update the number of color buffers */
135 pipe->ctx->framebuffer.state.nr_cbufs =
136 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138 /* Update the cb_target_mask
139 * XXX: I think this is a potential spot for bugs once we start doing
140 * GL interop. cb_target_mask may be modified in the 3D sections
141 * of this driver. */
142 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145 evergreen_init_color_surface_rat(rctx, surf);
146 }
147
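/* Bind a buffer to one of the compute-shader vertex buffer slots and mark the
 * CS vertex buffer state atom dirty so that it gets re-emitted. */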
148 static void evergreen_cs_set_vertex_buffer(
149 struct r600_context * rctx,
150 unsigned vb_index,
151 unsigned offset,
152 struct pipe_resource * buffer)
153 {
154 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156 vb->stride = 1;
157 vb->buffer_offset = offset;
158 vb->buffer = buffer;
159 vb->user_buffer = NULL;
160
161 /* The vertex instructions in the compute shaders use the texture cache,
162 * so we need to invalidate it. */
163 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164 state->enabled_mask |= 1 << vb_index;
165 state->dirty_mask |= 1 << vb_index;
166 state->atom.dirty = true;
167 }
168
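/* Bind a range of a buffer as a compute-shader constant buffer. */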
169 static void evergreen_cs_set_constant_buffer(
170 struct r600_context * rctx,
171 unsigned cb_index,
172 unsigned offset,
173 unsigned size,
174 struct pipe_resource * buffer)
175 {
176 struct pipe_constant_buffer cb;
177 cb.buffer_size = size;
178 cb.buffer_offset = offset;
179 cb.buffer = buffer;
180 cb.user_buffer = NULL;
181
182 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183 }
184
185 static const struct u_resource_vtbl r600_global_buffer_vtbl =
186 {
187 u_default_resource_get_handle, /* get_handle */
188 r600_compute_global_buffer_destroy, /* resource_destroy */
189 r600_compute_global_transfer_map, /* transfer_map */
190 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191 r600_compute_global_transfer_unmap, /* transfer_unmap */
192 r600_compute_global_transfer_inline_write /* transfer_inline_write */
193 };
194
195
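/* Create the compute state object.  With LLVM older than 3.6 the input is an
 * LLVM IR blob that is split into its individual kernels here; otherwise the
 * input is a native binary (PIPE_SHADER_IR_NATIVE) that is parsed with
 * radeon_elf_read() and whose bytecode is uploaded to a VRAM buffer. */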
196 void *evergreen_create_compute_state(
197 struct pipe_context *ctx_,
198 	const struct pipe_compute_state *cso)
199 {
200 struct r600_context *ctx = (struct r600_context *)ctx_;
201 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202 const struct pipe_llvm_program_header * header;
203 const char *code;
204 void *p;
205 boolean use_kill;
206
207 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
208 header = cso->prog;
209 code = cso->prog + sizeof(struct pipe_llvm_program_header);
210 #if HAVE_LLVM < 0x0306
211 #ifdef HAVE_OPENCL
212 (void)use_kill;
213 (void)p;
214 shader->llvm_ctx = LLVMContextCreate();
215 shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
216 code, header->num_bytes);
217 shader->kernels = CALLOC(sizeof(struct r600_kernel),
218 shader->num_kernels);
219 {
220 unsigned i;
221 for (i = 0; i < shader->num_kernels; i++) {
222 struct r600_kernel *kernel = &shader->kernels[i];
223 kernel->llvm_module = radeon_llvm_get_kernel_module(
224 shader->llvm_ctx, i, code, header->num_bytes);
225 }
226 }
227 #endif
228 #else
229 memset(&shader->binary, 0, sizeof(shader->binary));
230 radeon_elf_read(code, header->num_bytes, &shader->binary, true);
231 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
232
233 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
234 shader->bc.ndw * 4);
235 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
236 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
237 ctx->b.ws->buffer_unmap(shader->code_bo->cs_buf);
238 #endif
239
240 shader->ctx = (struct r600_context*)ctx;
241 shader->local_size = cso->req_local_mem;
242 shader->private_size = cso->req_private_mem;
243 shader->input_size = cso->req_input_mem;
244
245 return shader;
246 }
247
248 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
249 {
250 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
251
252 if (!shader)
253 return;
254
255 FREE(shader);
256 }
257
258 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
259 {
260 struct r600_context *ctx = (struct r600_context *)ctx_;
261
262 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
263
264 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
265 }
266
267 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
268  * kernel parameters, there are implicit parameters that need to be stored
269 * in the vertex buffer as well. Here is how these parameters are organized in
270 * the buffer:
271 *
272 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
273 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
274 * DWORDS 6-8: Number of work items within each work group in each dimension
275 * (x,y,z)
276 * DWORDS 9+ : Kernel parameters
277 */
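/* For example, a hypothetical launch with block_layout = {64,1,1} and
 * grid_layout = {4,1,1} would upload:
 *   DWORDS 0-2: 4, 1, 1     (number of work groups)
 *   DWORDS 3-5: 256, 1, 1   (global size = grid * block)
 *   DWORDS 6-8: 64, 1, 1    (work items per work group)
 *   DWORDS 9+ : the kernel's own arguments, copied verbatim from 'input'
 */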
278 void evergreen_compute_upload_input(
279 struct pipe_context *ctx_,
280 const uint *block_layout,
281 const uint *grid_layout,
282 const void *input)
283 {
284 struct r600_context *ctx = (struct r600_context *)ctx_;
285 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
286 unsigned i;
287 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
288 * parameters.
289 */
290 unsigned input_size = shader->input_size + 36;
291 uint32_t * num_work_groups_start;
292 uint32_t * global_size_start;
293 uint32_t * local_size_start;
294 uint32_t * kernel_parameters_start;
295 struct pipe_box box;
296 struct pipe_transfer *transfer = NULL;
297
298 if (shader->input_size == 0) {
299 return;
300 }
301
302 if (!shader->kernel_param) {
303 /* Add space for the grid dimensions */
304 shader->kernel_param = (struct r600_resource *)
305 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
306 PIPE_USAGE_IMMUTABLE, input_size);
307 }
308
309 u_box_1d(0, input_size, &box);
310 num_work_groups_start = ctx_->transfer_map(ctx_,
311 (struct pipe_resource*)shader->kernel_param,
312 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
313 &box, &transfer);
314 	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
315 	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
316 	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
317
318 	/* Copy the number of work groups (grid dimensions) */
319 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
320
321 /* Copy the global size */
322 for (i = 0; i < 3; i++) {
323 global_size_start[i] = grid_layout[i] * block_layout[i];
324 }
325
326 /* Copy the local dimensions */
327 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
328
329 /* Copy the kernel inputs */
330 memcpy(kernel_parameters_start, input, shader->input_size);
331
332 for (i = 0; i < (input_size / 4); i++) {
333 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
334 ((unsigned*)num_work_groups_start)[i]);
335 }
336
337 ctx_->transfer_unmap(ctx_, transfer);
338
339 /* ID=0 is reserved for the parameters */
340 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
341 (struct pipe_resource*)shader->kernel_param);
342 }
343
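/* Emit the thread-group and grid dimensions, the SQ_LDS_ALLOC register and
 * finally the DISPATCH_DIRECT packet for the current compute shader. */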
344 static void evergreen_emit_direct_dispatch(
345 struct r600_context *rctx,
346 const uint *block_layout, const uint *grid_layout)
347 {
348 int i;
349 struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
350 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
351 unsigned num_waves;
352 unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
353 unsigned wave_divisor = (16 * num_pipes);
354 int group_size = 1;
355 int grid_size = 1;
356 unsigned lds_size = shader->local_size / 4 +
357 #if HAVE_LLVM < 0x0306
358 shader->active_kernel->bc.nlds_dw;
359 #else
360 shader->bc.nlds_dw;
361 #endif
362
363
364 /* Calculate group_size/grid_size */
365 for (i = 0; i < 3; i++) {
366 group_size *= block_layout[i];
367 }
368
369 for (i = 0; i < 3; i++) {
370 grid_size *= grid_layout[i];
371 }
372
373 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
374 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
375 wave_divisor - 1) / wave_divisor;
376
377 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
378 "%u wavefronts per thread block, "
379 "allocating %u dwords lds.\n",
380 num_pipes, num_waves, lds_size);
381
382 r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
383
384 r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
385 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
386 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
387 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
388
389 r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
390 group_size);
391
392 r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
393 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
394 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
395 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
396
397 if (rctx->b.chip_class < CAYMAN) {
398 assert(lds_size <= 8192);
399 } else {
400 /* Cayman appears to have a slightly smaller limit, see the
401 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
402 assert(lds_size <= 8160);
403 }
404
405 r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
406 lds_size | (num_waves << 14));
407
408 /* Dispatch packet */
409 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
410 radeon_emit(cs, grid_layout[0]);
411 radeon_emit(cs, grid_layout[1]);
412 radeon_emit(cs, grid_layout[2]);
413 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
414 radeon_emit(cs, 1);
415 }
416
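/* Build the command stream for a compute dispatch: emit the compute start
 * state, the RAT color buffers, the vertex/constant buffer and CS shader
 * atoms, then the dispatch itself, followed by the required cache flushes. */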
417 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
418 const uint *grid_layout)
419 {
420 struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
421 unsigned i;
422
423 	/* make sure that the gfx ring is the only active one */
424 if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
425 ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
426 }
427
428 /* Initialize all the compute-related registers.
429 *
430 * See evergreen_init_atom_start_compute_cs() in this file for the list
431 * of registers initialized by the start_compute_cs_cmd atom.
432 */
433 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
434
435 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
436 r600_flush_emit(ctx);
437
438 /* Emit colorbuffers. */
439 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
440 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
441 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
442 unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
443 (struct r600_resource*)cb->base.texture,
444 RADEON_USAGE_READWRITE,
445 RADEON_PRIO_SHADER_RESOURCE_RW);
446
447 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
448 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
449 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
450 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
451 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
452 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
453 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
454 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
455
456 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
457 radeon_emit(cs, reloc);
458
459 if (!ctx->keep_tiling_flags) {
460 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
461 radeon_emit(cs, reloc);
462 }
463
464 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
465 radeon_emit(cs, reloc);
466 }
467 if (ctx->keep_tiling_flags) {
468 for (; i < 8 ; i++) {
469 r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
470 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
471 }
472 for (; i < 12; i++) {
473 r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
474 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
475 }
476 }
477
478 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
479 r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
480 ctx->compute_cb_target_mask);
481
482
483 /* Emit vertex buffer state */
484 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
485 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
486
487 /* Emit constant buffer state */
488 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
489
490 /* Emit compute shader state */
491 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
492
493 /* Emit dispatch state and dispatch packet */
494 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
495
496 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
497 */
498 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
499 R600_CONTEXT_INV_VERTEX_CACHE |
500 R600_CONTEXT_INV_TEX_CACHE;
501 r600_flush_emit(ctx);
502 ctx->b.flags = 0;
503
504 if (ctx->b.chip_class >= CAYMAN) {
505 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
506 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
507 /* DEALLOC_STATE prevents the GPU from hanging when a
508 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
509 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
510 */
511 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
512 cs->buf[cs->cdw++] = 0;
513 }
514
515 #if 0
516 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
517 for (i = 0; i < cs->cdw; i++) {
518 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
519 }
520 #endif
521
522 }
523
524
525 /**
526 * Emit function for r600_cs_shader_state atom
527 */
528 void evergreen_emit_cs_shader(
529 struct r600_context *rctx,
530 struct r600_atom *atom)
531 {
532 struct r600_cs_shader_state *state =
533 (struct r600_cs_shader_state*)atom;
534 struct r600_pipe_compute *shader = state->shader;
535 struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
536 uint64_t va;
537 struct r600_resource *code_bo;
538 unsigned ngpr, nstack;
539
540 #if HAVE_LLVM < 0x0306
541 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
542 code_bo = kernel->code_bo;
543 va = kernel->code_bo->gpu_address;
544 ngpr = kernel->bc.ngpr;
545 nstack = kernel->bc.nstack;
546 #else
547 code_bo = shader->code_bo;
548 va = shader->code_bo->gpu_address + state->pc;
549 ngpr = shader->bc.ngpr;
550 nstack = shader->bc.nstack;
551 #endif
552
553 r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
554 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
555 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
556 S_0288D4_NUM_GPRS(ngpr)
557 | S_0288D4_STACK_SIZE(nstack));
558 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
559
560 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
561 radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
562 code_bo, RADEON_USAGE_READ,
563 RADEON_PRIO_SHADER_DATA));
564 }
565
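/* Launch a compute grid.  With LLVM older than 3.6 the selected kernel's LLVM
 * module is compiled to bytecode on first use; with the native path, 'pc' is
 * an offset into the already-uploaded code buffer and only the per-kernel
 * config (GPRs, stack size, LDS) is read from the binary here. */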
566 static void evergreen_launch_grid(
567 struct pipe_context *ctx_,
568 const uint *block_layout, const uint *grid_layout,
569 uint32_t pc, const void *input)
570 {
571 struct r600_context *ctx = (struct r600_context *)ctx_;
572 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
573 boolean use_kill;
574
575 #if HAVE_LLVM < 0x0306
576 #ifdef HAVE_OPENCL
577 struct r600_kernel *kernel = &shader->kernels[pc];
578 (void)use_kill;
579 if (!kernel->code_bo) {
580 void *p;
581 struct r600_bytecode *bc = &kernel->bc;
582 LLVMModuleRef mod = kernel->llvm_module;
583 boolean use_kill = false;
584 bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
585 unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
586 unsigned sb_disasm = use_sb ||
587 (ctx->screen->b.debug_flags & DBG_SB_DISASM);
588
589 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
590 ctx->screen->has_compressed_msaa_texturing);
591 bc->type = TGSI_PROCESSOR_COMPUTE;
592 bc->isa = ctx->isa;
593 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
594
595 if (dump && !sb_disasm) {
596 r600_bytecode_disasm(bc);
597 } else if ((dump && sb_disasm) || use_sb) {
598 if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
599 R600_ERR("r600_sb_bytecode_process failed!\n");
600 }
601
602 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
603 kernel->bc.ndw * 4);
604 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
605 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
606 ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
607 }
608 #endif
609 shader->active_kernel = kernel;
610 ctx->cs_shader_state.kernel_index = pc;
611 #else
612 ctx->cs_shader_state.pc = pc;
613 /* Get the config information for this kernel. */
614 r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
615 #endif
616
617 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
618
619
620 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
621 compute_emit_cs(ctx, block_layout, grid_layout);
622 }
623
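/* Bind the compute resources (global buffer surfaces).  Writable surfaces are
 * additionally bound as RAT(i+1); every surface is also bound as a vertex
 * buffer starting at slot 2, since slots 0 and 1 are reserved for the kernel
 * parameters and the global buffer pool. */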
624 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
625 unsigned start, unsigned count,
626 struct pipe_surface ** surfaces)
627 {
628 struct r600_context *ctx = (struct r600_context *)ctx_;
629 struct r600_surface **resources = (struct r600_surface **)surfaces;
630
631 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
632 start, count);
633
634 for (unsigned i = 0; i < count; i++) {
635 		/* The first two vertex buffers are reserved for parameters and
636 * global buffers. */
637 unsigned vtx_id = 2 + i;
638 if (resources[i]) {
639 struct r600_resource_global *buffer =
640 (struct r600_resource_global*)
641 resources[i]->base.texture;
642 if (resources[i]->base.writable) {
643 assert(i+1 < 12);
644
645 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
646 (struct r600_resource *)resources[i]->base.texture,
647 buffer->chunk->start_in_dw*4,
648 resources[i]->base.texture->width0);
649 }
650
651 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
652 buffer->chunk->start_in_dw * 4,
653 resources[i]->base.texture);
654 }
655 }
656 }
657
658 void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
659 unsigned start_slot, unsigned count,
660 struct pipe_sampler_view **views)
661 {
662 struct r600_pipe_sampler_view **resource =
663 (struct r600_pipe_sampler_view **)views;
664
665 for (unsigned i = 0; i < count; i++) {
666 if (resource[i]) {
667 assert(i+1 < 12);
668 /* XXX: Implement */
669 assert(!"Compute samplers not implemented.");
670 			// FETCH0 = VTX0 (param buffer),
671 			// FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
672 }
673 }
674 }
675
676
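/* Bind OpenCL global buffers.  Each buffer is marked for promotion into the
 * compute memory pool, the pool is finalized, and the handle supplied by the
 * state tracker is patched to the buffer's byte offset within the pool. */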
677 static void evergreen_set_global_binding(
678 struct pipe_context *ctx_, unsigned first, unsigned n,
679 struct pipe_resource **resources,
680 uint32_t **handles)
681 {
682 struct r600_context *ctx = (struct r600_context *)ctx_;
683 struct compute_memory_pool *pool = ctx->screen->global_pool;
684 struct r600_resource_global **buffers =
685 (struct r600_resource_global **)resources;
686 unsigned i;
687
688 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
689 first, n);
690
691 if (!resources) {
692 /* XXX: Unset */
693 return;
694 }
695
696 /* We mark these items for promotion to the pool if they
697 * aren't already there */
698 for (i = first; i < first + n; i++) {
699 struct compute_memory_item *item = buffers[i]->chunk;
700
701 if (!is_item_in_pool(item))
702 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
703 }
704
705 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
706 /* XXX: Unset */
707 return;
708 }
709
710 for (i = first; i < first + n; i++)
711 {
712 uint32_t buffer_offset;
713 uint32_t handle;
714 assert(resources[i]->target == PIPE_BUFFER);
715 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
716
717 buffer_offset = util_le32_to_cpu(*(handles[i]));
718 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
719
720 *(handles[i]) = util_cpu_to_le32(handle);
721 }
722
723 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
724 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
725 (struct pipe_resource*)pool->bo);
726 }
727
728 /**
729 * This function initializes all the compute specific registers that need to
730 * be initialized for each compute command stream. Registers that are common
731 * to both compute and 3D will be initialized at the beginning of each compute
732 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
733 * packet requires that the shader type bit be set, we must initialize all
734 * context registers needed for compute in this function. The registers
735  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
736 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
737 * on the GPU family.
738 */
739 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
740 {
741 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
742 int num_threads;
743 int num_stack_entries;
744
745 /* since all required registers are initialised in the
746 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
747 */
748 r600_init_command_buffer(cb, 256);
749 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
750
751 /* This must be first. */
752 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
753 r600_store_value(cb, 0x80000000);
754 r600_store_value(cb, 0x80000000);
755
756 /* We're setting config registers here. */
757 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
758 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
759
760 switch (ctx->b.family) {
761 case CHIP_CEDAR:
762 default:
763 num_threads = 128;
764 num_stack_entries = 256;
765 break;
766 case CHIP_REDWOOD:
767 num_threads = 128;
768 num_stack_entries = 256;
769 break;
770 case CHIP_JUNIPER:
771 num_threads = 128;
772 num_stack_entries = 512;
773 break;
774 case CHIP_CYPRESS:
775 case CHIP_HEMLOCK:
776 num_threads = 128;
777 num_stack_entries = 512;
778 break;
779 case CHIP_PALM:
780 num_threads = 128;
781 num_stack_entries = 256;
782 break;
783 case CHIP_SUMO:
784 num_threads = 128;
785 num_stack_entries = 256;
786 break;
787 case CHIP_SUMO2:
788 num_threads = 128;
789 num_stack_entries = 512;
790 break;
791 case CHIP_BARTS:
792 num_threads = 128;
793 num_stack_entries = 512;
794 break;
795 case CHIP_TURKS:
796 num_threads = 128;
797 num_stack_entries = 256;
798 break;
799 case CHIP_CAICOS:
800 num_threads = 128;
801 num_stack_entries = 256;
802 break;
803 }
804
805 /* Config Registers */
806 if (ctx->b.chip_class < CAYMAN)
807 evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
808 ctx->screen->b.info.drm_minor);
809 else
810 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
811 ctx->screen->b.info.drm_minor);
812
813 /* The primitive type always needs to be POINTLIST for compute. */
814 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
815 V_008958_DI_PT_POINTLIST);
816
817 if (ctx->b.chip_class < CAYMAN) {
818
819 /* These registers control which simds can be used by each stage.
820 * The default for these registers is 0xffffffff, which means
821 * all simds are available for each stage. It's possible we may
822 * want to play around with these in the future, but for now
823 * the default value is fine.
824 *
825 * R_008E20_SQ_STATIC_THREAD_MGMT1
826 * R_008E24_SQ_STATIC_THREAD_MGMT2
827 * R_008E28_SQ_STATIC_THREAD_MGMT3
828 */
829
830 		/* XXX: We may need to adjust the thread and stack resource
831 * values for 3D/compute interop */
832
833 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
834
835 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
836 * Set the number of threads used by the PS/VS/GS/ES stage to
837 * 0.
838 */
839 r600_store_value(cb, 0);
840
841 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
842 * Set the number of threads used by the CS (aka LS) stage to
843 * the maximum number of threads and set the number of threads
844 * for the HS stage to 0. */
845 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
846
847 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
848 * Set the Control Flow stack entries to 0 for PS/VS stages */
849 r600_store_value(cb, 0);
850
851 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
852 * Set the Control Flow stack entries to 0 for GS/ES stages */
853 r600_store_value(cb, 0);
854
855 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
856 		 * Set the Control Flow stack entries to 0 for the HS stage, and
857 * set it to the maximum value for the CS (aka LS) stage. */
858 r600_store_value(cb,
859 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
860 }
861 /* Give the compute shader all the available LDS space.
862 * NOTE: This only sets the maximum number of dwords that a compute
863 * shader can allocate. When a shader is executed, we still need to
864 * allocate the appropriate amount of LDS dwords using the
865 * CM_R_0288E8_SQ_LDS_ALLOC register.
866 */
867 if (ctx->b.chip_class < CAYMAN) {
868 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
869 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
870 } else {
871 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
872 S_0286FC_NUM_PS_LDS(0) |
873 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
874 }
875
876 /* Context Registers */
877
878 if (ctx->b.chip_class < CAYMAN) {
879 /* workaround for hw issues with dyn gpr - must set all limits
880 * to 240 instead of 0, 0x1e == 240 / 8
881 */
882 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
883 S_028838_PS_GPRS(0x1e) |
884 S_028838_VS_GPRS(0x1e) |
885 S_028838_GS_GPRS(0x1e) |
886 S_028838_ES_GPRS(0x1e) |
887 S_028838_HS_GPRS(0x1e) |
888 S_028838_LS_GPRS(0x1e));
889 }
890
891 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
892 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
893 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
894
895 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
896
897 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
898 	                       S_0286E8_TID_IN_GROUP_ENA |
899 	                       S_0286E8_TGID_ENA |
900 	                       S_0286E8_DISABLE_INDEX_PACK);
901
902
903 	/* The LOOP_CONST registers are an optimization for loops that allows
904 	 * you to store the initial counter, increment value, and maximum
905 	 * counter value in a register so that the hardware can calculate the
906 	 * correct number of iterations for the loop, so that you don't need
907 	 * to have the loop counter in your shader code.  We don't currently use
908 	 * this optimization, so we must keep track of the counter in the
909 	 * shader and use a break instruction to exit loops.  However, the
910 	 * hardware still uses this register to determine when to exit a
911 	 * loop, so we need to initialize the counter to 0, set the increment
912 	 * value to 1 and the maximum counter value to 4095 (0xfff), which
913 	 * is the maximum value allowed.  This gives us a maximum of 4096
914 	 * iterations for our loops, but hopefully our break instruction will
915 	 * execute some time before the 4096th iteration.
916 */
917 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
918 }
919
920 void evergreen_init_compute_state_functions(struct r600_context *ctx)
921 {
922 ctx->b.b.create_compute_state = evergreen_create_compute_state;
923 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
924 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
925 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
926 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
927 ctx->b.b.set_global_binding = evergreen_set_global_binding;
928 ctx->b.b.launch_grid = evergreen_launch_grid;
929
930 }
931
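/* Create a global (OpenCL __global) buffer.  These are not standalone
 * resources; each one is a chunk allocated from the screen's compute memory
 * pool via compute_memory_alloc(). */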
932 struct pipe_resource *r600_compute_global_buffer_create(
933 struct pipe_screen *screen,
934 const struct pipe_resource *templ)
935 {
936 struct r600_resource_global* result = NULL;
937 struct r600_screen* rscreen = NULL;
938 int size_in_dw = 0;
939
940 assert(templ->target == PIPE_BUFFER);
941 assert(templ->bind & PIPE_BIND_GLOBAL);
942 assert(templ->array_size == 1 || templ->array_size == 0);
943 assert(templ->depth0 == 1 || templ->depth0 == 0);
944 assert(templ->height0 == 1 || templ->height0 == 0);
945
946 result = (struct r600_resource_global*)
947 CALLOC(sizeof(struct r600_resource_global), 1);
948 rscreen = (struct r600_screen*)screen;
949
950 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
951 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
952 templ->array_size);
953
954 result->base.b.vtbl = &r600_global_buffer_vtbl;
955 result->base.b.b.screen = screen;
956 result->base.b.b = *templ;
957 pipe_reference_init(&result->base.b.b.reference, 1);
958
959 size_in_dw = (templ->width0+3) / 4;
960
961 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
962
963 if (result->chunk == NULL)
964 {
965 free(result);
966 return NULL;
967 }
968
969 return &result->base.b.b;
970 }
971
972 void r600_compute_global_buffer_destroy(
973 struct pipe_screen *screen,
974 struct pipe_resource *res)
975 {
976 struct r600_resource_global* buffer = NULL;
977 struct r600_screen* rscreen = NULL;
978
979 assert(res->target == PIPE_BUFFER);
980 assert(res->bind & PIPE_BIND_GLOBAL);
981
982 buffer = (struct r600_resource_global*)res;
983 rscreen = (struct r600_screen*)screen;
984
985 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
986
987 buffer->chunk = NULL;
988 free(res);
989 }
990
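/* Map a global buffer.  If the item currently lives in the memory pool it is
 * demoted to its own real buffer first (otherwise one is allocated on demand),
 * and that real buffer is what actually gets mapped with
 * pipe_buffer_map_range(). */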
991 void *r600_compute_global_transfer_map(
992 struct pipe_context *ctx_,
993 struct pipe_resource *resource,
994 unsigned level,
995 unsigned usage,
996 const struct pipe_box *box,
997 struct pipe_transfer **ptransfer)
998 {
999 struct r600_context *rctx = (struct r600_context*)ctx_;
1000 struct compute_memory_pool *pool = rctx->screen->global_pool;
1001 struct r600_resource_global* buffer =
1002 (struct r600_resource_global*)resource;
1003
1004 struct compute_memory_item *item = buffer->chunk;
1005 struct pipe_resource *dst = NULL;
1006 unsigned offset = box->x;
1007
1008 if (is_item_in_pool(item)) {
1009 compute_memory_demote_item(pool, item, ctx_);
1010 }
1011 else {
1012 if (item->real_buffer == NULL) {
1013 item->real_buffer = (struct r600_resource*)
1014 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1015 }
1016 }
1017
1018 dst = (struct pipe_resource*)item->real_buffer;
1019
1020 if (usage & PIPE_TRANSFER_READ)
1021 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1022
1023 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1024 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1025 "width = %u, height = %u, depth = %u)\n", level, usage,
1026 box->x, box->y, box->z, box->width, box->height,
1027 box->depth);
1028 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1029 "%u (box.x)\n", item->id, box->x);
1030
1031
1032 assert(resource->target == PIPE_BUFFER);
1033 assert(resource->bind & PIPE_BIND_GLOBAL);
1034 assert(box->x >= 0);
1035 assert(box->y == 0);
1036 assert(box->z == 0);
1037
1038 ///TODO: do it better, mapping is not possible if the pool is too big
1039 return pipe_buffer_map_range(ctx_, dst,
1040 offset, box->width, usage, ptransfer);
1041 }
1042
1043 void r600_compute_global_transfer_unmap(
1044 struct pipe_context *ctx_,
1045 struct pipe_transfer* transfer)
1046 {
1047 /* struct r600_resource_global are not real resources, they just map
1048 * to an offset within the compute memory pool. The function
1049 * r600_compute_global_transfer_map() maps the memory pool
1050 * resource rather than the struct r600_resource_global passed to
1051 	 * it as an argument and then initializes ptransfer->resource with
1052 	 * the memory pool resource (via pipe_buffer_map_range).
1053 	 * When transfer_unmap is called, it uses the memory pool resource's
1054 	 * vtable, which calls r600_buffer_transfer_unmap() rather than
1055 * this function.
1056 */
1057 assert (!"This function should not be called");
1058 }
1059
1060 void r600_compute_global_transfer_flush_region(
1061 struct pipe_context *ctx_,
1062 struct pipe_transfer *transfer,
1063 const struct pipe_box *box)
1064 {
1065 assert(0 && "TODO");
1066 }
1067
1068 void r600_compute_global_transfer_inline_write(
1069 struct pipe_context *pipe,
1070 struct pipe_resource *resource,
1071 unsigned level,
1072 unsigned usage,
1073 const struct pipe_box *box,
1074 const void *data,
1075 unsigned stride,
1076 unsigned layer_stride)
1077 {
1078 assert(0 && "TODO");
1079 }