r600/eg: workaround bug with tess shader and dynamic GPRs.
[mesa.git] / src / gallium / drivers / r600 / evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62
63 TEX2... consumes the same fetch resources that VTX2... would consume
64
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70
71 RATs are limited to 12, so we can bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => at most 10 image bindings for writing.
74
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80
81 writable images should be listed first (id < 10), so their id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
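/* As a rough illustration of the mapping described above (derived from that
 * comment only, not from additional hardware documentation), writable image
 * ids map to slots like this:
 *
 *   image id 0 -> RAT1, TEX2
 *   image id 1 -> RAT2, TEX3
 *   ...
 *   image id 9 -> RAT10, TEX11
 */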
85
86 struct r600_resource* r600_compute_buffer_alloc_vram(
87 struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource * buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create(
94 (struct pipe_screen*) screen,
95 PIPE_BIND_CUSTOM,
96 PIPE_USAGE_IMMUTABLE,
97 size);
98
99 return (struct r600_resource *)buffer;
100 }
101
102
103 static void evergreen_set_rat(
104 struct r600_pipe_compute *pipe,
105 unsigned id,
106 struct r600_resource* bo,
107 int start,
108 int size)
109 {
110 struct pipe_surface rat_templ;
111 struct r600_surface *surf = NULL;
112 struct r600_context *rctx = NULL;
113
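/* There are only 12 RATs, the surface size must be a whole number of dwords,
 * and the start offset must be 256-byte aligned, since the CB base registers
 * take addresses in 256-byte units.
 */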
114 assert(id < 12);
115 assert((size & 3) == 0);
116 assert((start & 0xFF) == 0);
117
118 rctx = pipe->ctx;
119
120 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
121
122 /* Create the RAT surface */
123 memset(&rat_templ, 0, sizeof(rat_templ));
124 rat_templ.format = PIPE_FORMAT_R32_UINT;
125 rat_templ.u.tex.level = 0;
126 rat_templ.u.tex.first_layer = 0;
127 rat_templ.u.tex.last_layer = 0;
128
129 /* Add the RAT to the list of color buffers */
130 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131 (struct pipe_context *)pipe->ctx,
132 (struct pipe_resource *)bo, &rat_templ);
133
134 /* Update the number of color buffers */
135 pipe->ctx->framebuffer.state.nr_cbufs =
136 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138 /* Update the cb_target_mask
139 * XXX: I think this is a potential spot for bugs once we start doing
140 * GL interop. cb_target_mask may be modified in the 3D sections
141 * of this driver. */
142 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145 evergreen_init_color_surface_rat(rctx, surf);
146 }
147
148 static void evergreen_cs_set_vertex_buffer(
149 struct r600_context * rctx,
150 unsigned vb_index,
151 unsigned offset,
152 struct pipe_resource * buffer)
153 {
154 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156 vb->stride = 1;
157 vb->buffer_offset = offset;
158 vb->buffer = buffer;
159 vb->user_buffer = NULL;
160
161 /* The vertex instructions in the compute shaders use the texture cache,
162 * so we need to invalidate it. */
163 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164 state->enabled_mask |= 1 << vb_index;
165 state->dirty_mask |= 1 << vb_index;
166 r600_mark_atom_dirty(rctx, &state->atom);
167 }
168
169 static void evergreen_cs_set_constant_buffer(
170 struct r600_context * rctx,
171 unsigned cb_index,
172 unsigned offset,
173 unsigned size,
174 struct pipe_resource * buffer)
175 {
176 struct pipe_constant_buffer cb;
177 cb.buffer_size = size;
178 cb.buffer_offset = offset;
179 cb.buffer = buffer;
180 cb.user_buffer = NULL;
181
182 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183 }
184
185 static const struct u_resource_vtbl r600_global_buffer_vtbl =
186 {
187 u_default_resource_get_handle, /* get_handle */
188 r600_compute_global_buffer_destroy, /* resource_destroy */
189 r600_compute_global_transfer_map, /* transfer_map */
190 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191 r600_compute_global_transfer_unmap, /* transfer_unmap */
192 r600_compute_global_transfer_inline_write /* transfer_inline_write */
193 };
194
195
196 void *evergreen_create_compute_state(
197 struct pipe_context *ctx_,
198 const struct pipe_compute_state *cso)
199 {
200 struct r600_context *ctx = (struct r600_context *)ctx_;
201 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const char *code;
205 void *p;
206 boolean use_kill;
207
208 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
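/* With LLVM older than 3.6 we only collect one LLVM module per kernel here;
 * compilation happens lazily in evergreen_launch_grid(). With 3.6 and newer
 * the incoming program is an ELF binary, so the bytecode is extracted and
 * uploaded to a VRAM buffer immediately.
 */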
211 #if HAVE_LLVM < 0x0306
212 (void)use_kill;
213 (void)p;
214 shader->llvm_ctx = LLVMContextCreate();
215 shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
216 code, header->num_bytes);
217 shader->kernels = CALLOC(sizeof(struct r600_kernel),
218 shader->num_kernels);
219 {
220 unsigned i;
221 for (i = 0; i < shader->num_kernels; i++) {
222 struct r600_kernel *kernel = &shader->kernels[i];
223 kernel->llvm_module = radeon_llvm_get_kernel_module(
224 shader->llvm_ctx, i, code, header->num_bytes);
225 }
226 }
227 #else
228 memset(&shader->binary, 0, sizeof(shader->binary));
229 radeon_elf_read(code, header->num_bytes, &shader->binary);
230 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
231
232 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
233 shader->bc.ndw * 4);
234 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
235 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
236 ctx->b.ws->buffer_unmap(shader->code_bo->cs_buf);
237 #endif
238 #endif
239
240 shader->ctx = ctx;
241 shader->local_size = cso->req_local_mem;
242 shader->private_size = cso->req_private_mem;
243 shader->input_size = cso->req_input_mem;
244
245 return shader;
246 }
247
248 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
249 {
250 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
251
252 if (!shader)
253 return;
254
255 FREE(shader);
256 }
257
258 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
259 {
260 struct r600_context *ctx = (struct r600_context *)ctx_;
261
262 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
263
264 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
265 }
266
267 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
268 * kernel parameters, there are implicit parameters that need to be stored
269 * in the vertex buffer as well. Here is how these parameters are organized in
270 * the buffer:
271 *
272 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
273 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
274 * DWORDS 6-8: Number of work items within each work group in each dimension
275 * (x,y,z)
276 * DWORDS 9+ : Kernel parameters
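 *
 * For example (purely illustrative numbers), a launch with
 * block_layout = (16, 16, 1) and grid_layout = (4, 2, 1) starts the buffer
 * with:
 *   DWORDS 0-2: 4, 2, 1    (work groups per dimension)
 *   DWORDS 3-5: 64, 32, 1  (global work items = grid * block)
 *   DWORDS 6-8: 16, 16, 1  (work items per work group)
 * followed by the kernel's own arguments.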
277 */
278 void evergreen_compute_upload_input(
279 struct pipe_context *ctx_,
280 const uint *block_layout,
281 const uint *grid_layout,
282 const void *input)
283 {
284 struct r600_context *ctx = (struct r600_context *)ctx_;
285 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
286 unsigned i;
287 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
288 * parameters.
289 */
290 unsigned input_size = shader->input_size + 36;
291 uint32_t * num_work_groups_start;
292 uint32_t * global_size_start;
293 uint32_t * local_size_start;
294 uint32_t * kernel_parameters_start;
295 struct pipe_box box;
296 struct pipe_transfer *transfer = NULL;
297
298 if (shader->input_size == 0) {
299 return;
300 }
301
302 if (!shader->kernel_param) {
303 /* Add space for the grid dimensions */
304 shader->kernel_param = (struct r600_resource *)
305 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
306 PIPE_USAGE_IMMUTABLE, input_size);
307 }
308
309 u_box_1d(0, input_size, &box);
310 num_work_groups_start = ctx_->transfer_map(ctx_,
311 (struct pipe_resource*)shader->kernel_param,
312 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
313 &box, &transfer);
314 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
315 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
316 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
317
318 /* Copy the grid layout (number of work groups) */
319 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
320
321 /* Copy the global size */
322 for (i = 0; i < 3; i++) {
323 global_size_start[i] = grid_layout[i] * block_layout[i];
324 }
325
326 /* Copy the local dimensions */
327 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
328
329 /* Copy the kernel inputs */
330 memcpy(kernel_parameters_start, input, shader->input_size);
331
332 for (i = 0; i < (input_size / 4); i++) {
333 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
334 ((unsigned*)num_work_groups_start)[i]);
335 }
336
337 ctx_->transfer_unmap(ctx_, transfer);
338
339 /* ID=0 is reserved for the parameters */
340 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
341 (struct pipe_resource*)shader->kernel_param);
342 }
343
344 static void evergreen_emit_direct_dispatch(
345 struct r600_context *rctx,
346 const uint *block_layout, const uint *grid_layout)
347 {
348 int i;
349 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
350 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
351 unsigned num_waves;
352 unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
353 unsigned wave_divisor = (16 * num_pipes);
354 int group_size = 1;
355 int grid_size = 1;
356 unsigned lds_size = shader->local_size / 4 +
357 #if HAVE_LLVM < 0x0306
358 shader->active_kernel->bc.nlds_dw;
359 #else
360 shader->bc.nlds_dw;
361 #endif
362
363
364 /* Calculate group_size/grid_size */
365 for (i = 0; i < 3; i++) {
366 group_size *= block_layout[i];
367 }
368
369 for (i = 0; i < 3; i++) {
370 grid_size *= grid_layout[i];
371 }
372
373 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
374 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
375 wave_divisor - 1) / wave_divisor;
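/* Worked example with assumed numbers: block_layout = (16, 16, 1) on a part
 * with 8 pipes gives wave_divisor = 128 and num_waves = (256 + 127) / 128 = 2.
 */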
376
377 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
378 "%u wavefronts per thread block, "
379 "allocating %u dwords lds.\n",
380 num_pipes, num_waves, lds_size);
381
382 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
383
384 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
385 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
386 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
387 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
388
389 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
390 group_size);
391
392 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
393 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
394 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
395 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
396
397 if (rctx->b.chip_class < CAYMAN) {
398 assert(lds_size <= 8192);
399 } else {
400 /* Cayman appears to have a slightly smaller limit, see the
401 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
402 assert(lds_size <= 8160);
403 }
404
405 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
406 lds_size | (num_waves << 14));
407
408 /* Dispatch packet */
409 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
410 radeon_emit(cs, grid_layout[0]);
411 radeon_emit(cs, grid_layout[1]);
412 radeon_emit(cs, grid_layout[2]);
413 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
414 radeon_emit(cs, 1);
415 }
416
417 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
418 const uint *grid_layout)
419 {
420 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
421 unsigned i;
422
423 /* make sure that the gfx ring is the only one active (flush the DMA ring) */
424 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
425 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
426 }
427
428 /* Initialize all the compute-related registers.
429 *
430 * See evergreen_init_atom_start_compute_cs() in this file for the list
431 * of registers initialized by the start_compute_cs_cmd atom.
432 */
433 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
434
435 /* emit config state */
436 if (ctx->b.chip_class == EVERGREEN)
437 r600_emit_atom(ctx, &ctx->config_state.atom);
438
439 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
440 r600_flush_emit(ctx);
441
442 /* Emit colorbuffers. */
443 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
444 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
445 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
446 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
447 (struct r600_resource*)cb->base.texture,
448 RADEON_USAGE_READWRITE,
449 RADEON_PRIO_SHADER_RW_BUFFER);
450
451 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
452 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
453 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
454 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
455 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
456 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
457 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
458 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
459
460 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
461 radeon_emit(cs, reloc);
462
463 if (!ctx->keep_tiling_flags) {
464 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
465 radeon_emit(cs, reloc);
466 }
467
468 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
469 radeon_emit(cs, reloc);
470 }
471 if (ctx->keep_tiling_flags) {
472 for (; i < 8 ; i++) {
473 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
474 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
475 }
476 for (; i < 12; i++) {
477 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
478 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
479 }
480 }
481
482 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
483 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
484 ctx->compute_cb_target_mask);
485
486
487 /* Emit vertex buffer state */
488 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
489 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
490
491 /* Emit constant buffer state */
492 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
493
494 /* Emit sampler state */
495 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
496
497 /* Emit sampler view (texture resource) state */
498 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
499
500 /* Emit compute shader state */
501 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
502
503 /* Emit dispatch state and dispatch packet */
504 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
505
506 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
507 */
508 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
509 R600_CONTEXT_INV_VERTEX_CACHE |
510 R600_CONTEXT_INV_TEX_CACHE;
511 r600_flush_emit(ctx);
512 ctx->b.flags = 0;
513
514 if (ctx->b.chip_class >= CAYMAN) {
515 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
516 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
517 /* DEALLOC_STATE prevents the GPU from hanging when a
518 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
519 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
520 */
521 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
522 cs->buf[cs->cdw++] = 0;
523 }
524
525 #if 0
526 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
527 for (i = 0; i < cs->cdw; i++) {
528 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
529 }
530 #endif
531
532 }
533
534
535 /**
536 * Emit function for r600_cs_shader_state atom
537 */
538 void evergreen_emit_cs_shader(
539 struct r600_context *rctx,
540 struct r600_atom *atom)
541 {
542 struct r600_cs_shader_state *state =
543 (struct r600_cs_shader_state*)atom;
544 struct r600_pipe_compute *shader = state->shader;
545 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
546 uint64_t va;
547 struct r600_resource *code_bo;
548 unsigned ngpr, nstack;
549
550 #if HAVE_LLVM < 0x0306
551 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
552 code_bo = kernel->code_bo;
553 va = kernel->code_bo->gpu_address;
554 ngpr = kernel->bc.ngpr;
555 nstack = kernel->bc.nstack;
556 #else
557 code_bo = shader->code_bo;
558 va = shader->code_bo->gpu_address + state->pc;
559 ngpr = shader->bc.ngpr;
560 nstack = shader->bc.nstack;
561 #endif
562
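/* SQ_PGM_START_LS takes the shader address in 256-byte units, hence the
 * va >> 8 below.
 */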
563 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
564 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
565 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
566 S_0288D4_NUM_GPRS(ngpr)
567 | S_0288D4_STACK_SIZE(nstack));
568 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
569
570 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
571 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
572 code_bo, RADEON_USAGE_READ,
573 RADEON_PRIO_USER_SHADER));
574 }
575
576 static void evergreen_launch_grid(
577 struct pipe_context *ctx_,
578 const uint *block_layout, const uint *grid_layout,
579 uint32_t pc, const void *input)
580 {
581 struct r600_context *ctx = (struct r600_context *)ctx_;
582 #ifdef HAVE_OPENCL
583 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
584 boolean use_kill;
585
586 #if HAVE_LLVM < 0x0306
587 struct r600_kernel *kernel = &shader->kernels[pc];
588 (void)use_kill;
589 if (!kernel->code_bo) {
590 void *p;
591 struct r600_bytecode *bc = &kernel->bc;
592 LLVMModuleRef mod = kernel->llvm_module;
593 boolean use_kill = false;
594 bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
595 unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
596 unsigned sb_disasm = use_sb ||
597 (ctx->screen->b.debug_flags & DBG_SB_DISASM);
598
599 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
600 ctx->screen->has_compressed_msaa_texturing);
601 bc->type = TGSI_PROCESSOR_COMPUTE;
602 bc->isa = ctx->isa;
603 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
604
605 if (dump && !sb_disasm) {
606 r600_bytecode_disasm(bc);
607 } else if ((dump && sb_disasm) || use_sb) {
608 if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
609 R600_ERR("r600_sb_bytecode_process failed!\n");
610 }
611
612 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
613 kernel->bc.ndw * 4);
614 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
615 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
616 ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
617 }
618 shader->active_kernel = kernel;
619 ctx->cs_shader_state.kernel_index = pc;
620 #else
621 ctx->cs_shader_state.pc = pc;
622 /* Get the config information for this kernel. */
623 r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
624 #endif
625 #endif
626
627 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
628
629
630 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
631 compute_emit_cs(ctx, block_layout, grid_layout);
632 }
633
634 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
635 unsigned start, unsigned count,
636 struct pipe_surface ** surfaces)
637 {
638 struct r600_context *ctx = (struct r600_context *)ctx_;
639 struct r600_surface **resources = (struct r600_surface **)surfaces;
640
641 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
642 start, count);
643
644 for (unsigned i = 0; i < count; i++) {
645 /* The first two vertex buffers are reserved for parameters and
646 * global buffers. */
647 unsigned vtx_id = 2 + i;
648 if (resources[i]) {
649 struct r600_resource_global *buffer =
650 (struct r600_resource_global*)
651 resources[i]->base.texture;
652 if (resources[i]->base.writable) {
653 assert(i+1 < 12);
654
655 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
656 (struct r600_resource *)resources[i]->base.texture,
657 buffer->chunk->start_in_dw*4,
658 resources[i]->base.texture->width0);
659 }
660
661 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
662 buffer->chunk->start_in_dw * 4,
663 resources[i]->base.texture);
664 }
665 }
666 }
667
668 static void evergreen_set_global_binding(
669 struct pipe_context *ctx_, unsigned first, unsigned n,
670 struct pipe_resource **resources,
671 uint32_t **handles)
672 {
673 struct r600_context *ctx = (struct r600_context *)ctx_;
674 struct compute_memory_pool *pool = ctx->screen->global_pool;
675 struct r600_resource_global **buffers =
676 (struct r600_resource_global **)resources;
677 unsigned i;
678
679 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
680 first, n);
681
682 if (!resources) {
683 /* XXX: Unset */
684 return;
685 }
686
687 /* We mark these items for promotion to the pool if they
688 * aren't already there */
689 for (i = first; i < first + n; i++) {
690 struct compute_memory_item *item = buffers[i]->chunk;
691
692 if (!is_item_in_pool(item))
693 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
694 }
695
696 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
697 /* XXX: Unset */
698 return;
699 }
700
701 for (i = first; i < first + n; i++)
702 {
703 uint32_t buffer_offset;
704 uint32_t handle;
705 assert(resources[i]->target == PIPE_BUFFER);
706 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
707
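/* The value handed back to the state tracker is the byte offset of this
 * buffer inside the global memory pool (bound as RAT0 / VTX1 below) plus
 * whatever offset it originally wrote into the handle.
 */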
708 buffer_offset = util_le32_to_cpu(*(handles[i]));
709 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
710
711 *(handles[i]) = util_cpu_to_le32(handle);
712 }
713
714 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
715 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
716 (struct pipe_resource*)pool->bo);
717 }
718
719 /**
720 * This function initializes all the compute specific registers that need to
721 * be initialized for each compute command stream. Registers that are common
722 * to both compute and 3D will be initialized at the beginning of each compute
723 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
724 * packet requires that the shader type bit be set, we must initialize all
725 * context registers needed for compute in this function. The registers
726 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
727 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
728 * on the GPU family.
729 */
730 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
731 {
732 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
733 int num_threads;
734 int num_stack_entries;
735
736 /* since all required registers are initialised in the
737 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
738 */
739 r600_init_command_buffer(cb, 256);
740 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
741
742 /* This must be first. */
743 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
744 r600_store_value(cb, 0x80000000);
745 r600_store_value(cb, 0x80000000);
746
747 /* We're setting config registers here. */
748 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
749 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
750
751 switch (ctx->b.family) {
752 case CHIP_CEDAR:
753 default:
754 num_threads = 128;
755 num_stack_entries = 256;
756 break;
757 case CHIP_REDWOOD:
758 num_threads = 128;
759 num_stack_entries = 256;
760 break;
761 case CHIP_JUNIPER:
762 num_threads = 128;
763 num_stack_entries = 512;
764 break;
765 case CHIP_CYPRESS:
766 case CHIP_HEMLOCK:
767 num_threads = 128;
768 num_stack_entries = 512;
769 break;
770 case CHIP_PALM:
771 num_threads = 128;
772 num_stack_entries = 256;
773 break;
774 case CHIP_SUMO:
775 num_threads = 128;
776 num_stack_entries = 256;
777 break;
778 case CHIP_SUMO2:
779 num_threads = 128;
780 num_stack_entries = 512;
781 break;
782 case CHIP_BARTS:
783 num_threads = 128;
784 num_stack_entries = 512;
785 break;
786 case CHIP_TURKS:
787 num_threads = 128;
788 num_stack_entries = 256;
789 break;
790 case CHIP_CAICOS:
791 num_threads = 128;
792 num_stack_entries = 256;
793 break;
794 }
795
796 /* Config Registers */
797 if (ctx->b.chip_class < CAYMAN)
798 evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
799 ctx->screen->b.info.drm_minor);
800 else
801 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
802 ctx->screen->b.info.drm_minor);
803
804 /* The primitive type always needs to be POINTLIST for compute. */
805 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
806 V_008958_DI_PT_POINTLIST);
807
808 if (ctx->b.chip_class < CAYMAN) {
809
810 /* These registers control which simds can be used by each stage.
811 * The default for these registers is 0xffffffff, which means
812 * all simds are available for each stage. It's possible we may
813 * want to play around with these in the future, but for now
814 * the default value is fine.
815 *
816 * R_008E20_SQ_STATIC_THREAD_MGMT1
817 * R_008E24_SQ_STATIC_THREAD_MGMT2
818 * R_008E28_SQ_STATIC_THREAD_MGMT3
819 */
820
821 /* XXX: We may need to adjust the thread and stack resource
822 * values for 3D/compute interop */
823
824 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
825
826 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
827 * Set the number of threads used by the PS/VS/GS/ES stage to
828 * 0.
829 */
830 r600_store_value(cb, 0);
831
832 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
833 * Set the number of threads used by the CS (aka LS) stage to
834 * the maximum number of threads and set the number of threads
835 * for the HS stage to 0. */
836 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
837
838 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
839 * Set the Control Flow stack entries to 0 for PS/VS stages */
840 r600_store_value(cb, 0);
841
842 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
843 * Set the Control Flow stack entries to 0 for GS/ES stages */
844 r600_store_value(cb, 0);
845
846 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
847 * Set the Control Flow stack entries to 0 for the HS stage, and
848 * set it to the maximum value for the CS (aka LS) stage. */
849 r600_store_value(cb,
850 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
851 }
852 /* Give the compute shader all the available LDS space.
853 * NOTE: This only sets the maximum number of dwords that a compute
854 * shader can allocate. When a shader is executed, we still need to
855 * allocate the appropriate amount of LDS dwords using the
856 * CM_R_0288E8_SQ_LDS_ALLOC register.
857 */
858 if (ctx->b.chip_class < CAYMAN) {
859 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
860 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
861 } else {
862 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
863 S_0286FC_NUM_PS_LDS(0) |
864 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
865 }
866
867 /* Context Registers */
868
869 if (ctx->b.chip_class < CAYMAN) {
870 /* workaround for hw issues with dyn gpr - must set all limits
871 * to 240 instead of 0, 0x1e == 240 / 8
872 */
873 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
874 S_028838_PS_GPRS(0x1e) |
875 S_028838_VS_GPRS(0x1e) |
876 S_028838_GS_GPRS(0x1e) |
877 S_028838_ES_GPRS(0x1e) |
878 S_028838_HS_GPRS(0x1e) |
879 S_028838_LS_GPRS(0x1e));
880 }
881
882 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
883 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
884 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
885
886 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
887
888 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
889 S_0286E8_TID_IN_GROUP_ENA
890 | S_0286E8_TGID_ENA
891 | S_0286E8_DISABLE_INDEX_PACK)
892 ;
893
894 /* The LOOP_CONST registers are an optimization for loops that allows
895 * you to store the initial counter, increment value, and maximum
896 * counter value in a register so that the hardware can calculate the
897 * correct number of iterations for the loop, and you don't need
898 * to have the loop counter in your shader code. We don't currently use
899 * this optimization, so we must keep track of the counter in the
900 * shader and use a break instruction to exit loops. However, the
901 * hardware still uses this register to determine when to exit a
902 * loop, so we need to initialize the counter to 0, set the increment
903 * value to 1 and the maximum counter value to 4095 (0xfff), which
904 * is the maximum value allowed. This gives us a maximum of 4096
905 * iterations for our loops, but hopefully our break instruction will
906 * execute some time before the 4096th iteration.
907 */
908 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
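/* A rough decode of the packed value above, assuming the usual SQ_LOOP_CONST
 * field layout: 0x1000FFF = maximum count 0xfff in the low 12 bits, initial
 * value 0 in the middle field, and increment 1 in the top byte.
 */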
909 }
910
911 void evergreen_init_compute_state_functions(struct r600_context *ctx)
912 {
913 ctx->b.b.create_compute_state = evergreen_create_compute_state;
914 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
915 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
916 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
917 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
918 ctx->b.b.set_global_binding = evergreen_set_global_binding;
919 ctx->b.b.launch_grid = evergreen_launch_grid;
920
921 }
922
923 struct pipe_resource *r600_compute_global_buffer_create(
924 struct pipe_screen *screen,
925 const struct pipe_resource *templ)
926 {
927 struct r600_resource_global* result = NULL;
928 struct r600_screen* rscreen = NULL;
929 int size_in_dw = 0;
930
931 assert(templ->target == PIPE_BUFFER);
932 assert(templ->bind & PIPE_BIND_GLOBAL);
933 assert(templ->array_size == 1 || templ->array_size == 0);
934 assert(templ->depth0 == 1 || templ->depth0 == 0);
935 assert(templ->height0 == 1 || templ->height0 == 0);
936
937 result = (struct r600_resource_global*)
938 CALLOC(sizeof(struct r600_resource_global), 1);
939 rscreen = (struct r600_screen*)screen;
940
941 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
942 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
943 templ->array_size);
944
945 result->base.b.vtbl = &r600_global_buffer_vtbl;
946 result->base.b.b.screen = screen;
947 result->base.b.b = *templ;
948 pipe_reference_init(&result->base.b.b.reference, 1);
949
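/* The compute memory pool allocates in dwords, so round the requested byte
 * size up to a whole number of dwords.
 */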
950 size_in_dw = (templ->width0+3) / 4;
951
952 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
953
954 if (result->chunk == NULL)
955 {
956 free(result);
957 return NULL;
958 }
959
960 return &result->base.b.b;
961 }
962
963 void r600_compute_global_buffer_destroy(
964 struct pipe_screen *screen,
965 struct pipe_resource *res)
966 {
967 struct r600_resource_global* buffer = NULL;
968 struct r600_screen* rscreen = NULL;
969
970 assert(res->target == PIPE_BUFFER);
971 assert(res->bind & PIPE_BIND_GLOBAL);
972
973 buffer = (struct r600_resource_global*)res;
974 rscreen = (struct r600_screen*)screen;
975
976 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
977
978 buffer->chunk = NULL;
979 free(res);
980 }
981
982 void *r600_compute_global_transfer_map(
983 struct pipe_context *ctx_,
984 struct pipe_resource *resource,
985 unsigned level,
986 unsigned usage,
987 const struct pipe_box *box,
988 struct pipe_transfer **ptransfer)
989 {
990 struct r600_context *rctx = (struct r600_context*)ctx_;
991 struct compute_memory_pool *pool = rctx->screen->global_pool;
992 struct r600_resource_global* buffer =
993 (struct r600_resource_global*)resource;
994
995 struct compute_memory_item *item = buffer->chunk;
996 struct pipe_resource *dst = NULL;
997 unsigned offset = box->x;
998
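/* Items that currently live inside the memory pool are demoted to their own
 * buffer before mapping; items outside the pool just get their backing
 * buffer allocated on demand.
 */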
999 if (is_item_in_pool(item)) {
1000 compute_memory_demote_item(pool, item, ctx_);
1001 }
1002 else {
1003 if (item->real_buffer == NULL) {
1004 item->real_buffer =
1005 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1006 }
1007 }
1008
1009 dst = (struct pipe_resource*)item->real_buffer;
1010
1011 if (usage & PIPE_TRANSFER_READ)
1012 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1013
1014 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1015 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1016 "width = %u, height = %u, depth = %u)\n", level, usage,
1017 box->x, box->y, box->z, box->width, box->height,
1018 box->depth);
1019 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1020 "%u (box.x)\n", item->id, box->x);
1021
1022
1023 assert(resource->target == PIPE_BUFFER);
1024 assert(resource->bind & PIPE_BIND_GLOBAL);
1025 assert(box->x >= 0);
1026 assert(box->y == 0);
1027 assert(box->z == 0);
1028
1029 /* TODO: do it better, mapping is not possible if the pool is too big */
1030 return pipe_buffer_map_range(ctx_, dst,
1031 offset, box->width, usage, ptransfer);
1032 }
1033
1034 void r600_compute_global_transfer_unmap(
1035 struct pipe_context *ctx_,
1036 struct pipe_transfer* transfer)
1037 {
1038 /* struct r600_resource_global are not real resources, they just map
1039 * to an offset within the compute memory pool. The function
1040 * r600_compute_global_transfer_map() maps the memory pool
1041 * resource rather than the struct r600_resource_global passed to
1042 * it as an argument, and then initializes ptransfer->resource with
1043 * the memory pool resource (via pipe_buffer_map_range).
1044 * When transfer_unmap is called, it uses the memory pool's
1045 * vtable, which calls r600_buffer_transfer_unmap() rather than
1046 * this function.
1047 */
1048 assert (!"This function should not be called");
1049 }
1050
1051 void r600_compute_global_transfer_flush_region(
1052 struct pipe_context *ctx_,
1053 struct pipe_transfer *transfer,
1054 const struct pipe_box *box)
1055 {
1056 assert(0 && "TODO");
1057 }
1058
1059 void r600_compute_global_transfer_inline_write(
1060 struct pipe_context *pipe,
1061 struct pipe_resource *resource,
1062 unsigned level,
1063 unsigned usage,
1064 const struct pipe_box *box,
1065 const void *data,
1066 unsigned stride,
1067 unsigned layer_stride)
1068 {
1069 assert(0 && "TODO");
1070 }