[mesa.git] src/gallium/drivers/r600/evergreen_compute.c
1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include "pipe/p_defines.h"
30 #include "pipe/p_state.h"
31 #include "pipe/p_context.h"
32 #include "util/u_blitter.h"
33 #include "util/list.h"
34 #include "util/u_transfer.h"
35 #include "util/u_surface.h"
36 #include "util/u_pack_color.h"
37 #include "util/u_memory.h"
38 #include "util/u_inlines.h"
39 #include "util/u_framebuffer.h"
40 #include "pipebuffer/pb_buffer.h"
41 #include "evergreend.h"
42 #include "r600_shader.h"
43 #include "r600_pipe.h"
44 #include "r600_formats.h"
45 #include "evergreen_compute.h"
46 #include "evergreen_compute_internal.h"
47 #include "compute_memory_pool.h"
48 #include "sb/sb_public.h"
49 #ifdef HAVE_OPENCL
50 #include "radeon/radeon_llvm_util.h"
51 #endif
52 #include "radeon/radeon_elf_util.h"
53 #include <inttypes.h>
54
55 /**
56 RAT0 is for global binding write
57 VTX1 is for global binding read
58
59 for writing images RAT1...
60 for reading images TEX2...
61 TEX2-RAT1 is paired
62 
63 TEX2... consumes the same fetch resources that VTX2... would consume
64 
65 CONST0 and VTX0 are for parameters
66 CONST0 binds the smaller input parameter buffer and is used for constant
67 indexing; it is also constant cached
68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
69 the constant cache can handle
70 
71 RATs are limited to 12, so we can bind at most 11 textures for writing
72 because we reserve RAT0 for global bindings. With byte addressing enabled,
73 we should reserve another one too => 10 image bindings for writing max.
74 
75 from Nvidia OpenCL:
76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
78 
79 so 10 for writing is enough. 176 is the max for reading according to the docs
80 
81 writable images should be listed first (id < 10), so that the image id corresponds to RAT(id+1)
82 writable images also consume TEX slots, and VTX slots too, because of linear indexing
83
84 */
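/* Compact restatement of the binding scheme above (descriptive only, nothing new):
 *   global binding:  write -> RAT0,  read -> VTX1
 *   parameters:      CONST0 (small, constant-indexed, cached) or VTX0 (indirect/large)
 *   image writes:    RAT1... (writable image id maps to RAT(id+1), id < 10)
 *   image reads:     TEX2... (TEX(id+2) is paired with RAT(id+1))
 */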
85
86 struct r600_resource* r600_compute_buffer_alloc_vram(
87 struct r600_screen *screen,
88 unsigned size)
89 {
90 struct pipe_resource * buffer = NULL;
91 assert(size);
92
93 buffer = pipe_buffer_create(
94 (struct pipe_screen*) screen,
95 PIPE_BIND_CUSTOM,
96 PIPE_USAGE_IMMUTABLE,
97 size);
98
99 return (struct r600_resource *)buffer;
100 }
101
102
103 static void evergreen_set_rat(
104 struct r600_pipe_compute *pipe,
105 unsigned id,
106 struct r600_resource* bo,
107 int start,
108 int size)
109 {
110 struct pipe_surface rat_templ;
111 struct r600_surface *surf = NULL;
112 struct r600_context *rctx = NULL;
113
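/* There are only 12 RATs; the RAT window must start on a 256-byte boundary
 * and its size must be a multiple of 4 bytes (enforced by the asserts below). */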
114 assert(id < 12);
115 assert((size & 3) == 0);
116 assert((start & 0xFF) == 0);
117
118 rctx = pipe->ctx;
119
120 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
121
122 /* Create the RAT surface */
123 memset(&rat_templ, 0, sizeof(rat_templ));
124 rat_templ.format = PIPE_FORMAT_R32_UINT;
125 rat_templ.u.tex.level = 0;
126 rat_templ.u.tex.first_layer = 0;
127 rat_templ.u.tex.last_layer = 0;
128
129 /* Add the RAT to the list of color buffers */
130 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131 (struct pipe_context *)pipe->ctx,
132 (struct pipe_resource *)bo, &rat_templ);
133
134 /* Update the number of color buffers */
135 pipe->ctx->framebuffer.state.nr_cbufs =
136 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138 /* Update the cb_target_mask
139 * XXX: I think this is a potential spot for bugs once we start doing
140 * GL interop. cb_target_mask may be modified in the 3D sections
141 * of this driver. */
142 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145 evergreen_init_color_surface_rat(rctx, surf);
146 }
147
148 static void evergreen_cs_set_vertex_buffer(
149 struct r600_context * rctx,
150 unsigned vb_index,
151 unsigned offset,
152 struct pipe_resource * buffer)
153 {
154 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156 vb->stride = 1;
157 vb->buffer_offset = offset;
158 vb->buffer = buffer;
159 vb->user_buffer = NULL;
160
161 /* The vertex instructions in the compute shaders use the texture cache,
162 * so we need to invalidate it. */
163 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164 state->enabled_mask |= 1 << vb_index;
165 state->dirty_mask |= 1 << vb_index;
166 r600_mark_atom_dirty(rctx, &state->atom);
167 }
168
169 static void evergreen_cs_set_constant_buffer(
170 struct r600_context * rctx,
171 unsigned cb_index,
172 unsigned offset,
173 unsigned size,
174 struct pipe_resource * buffer)
175 {
176 struct pipe_constant_buffer cb;
177 cb.buffer_size = size;
178 cb.buffer_offset = offset;
179 cb.buffer = buffer;
180 cb.user_buffer = NULL;
181
182 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183 }
184
185 static const struct u_resource_vtbl r600_global_buffer_vtbl =
186 {
187 u_default_resource_get_handle, /* get_handle */
188 r600_compute_global_buffer_destroy, /* resource_destroy */
189 r600_compute_global_transfer_map, /* transfer_map */
190 r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191 r600_compute_global_transfer_unmap, /* transfer_unmap */
192 r600_compute_global_transfer_inline_write /* transfer_inline_write */
193 };
194
195
196 void *evergreen_create_compute_state(
197 struct pipe_context *ctx_,
198 const struct pipe_compute_state *cso)
199 {
200 struct r600_context *ctx = (struct r600_context *)ctx_;
201 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202 #ifdef HAVE_OPENCL
203 const struct pipe_llvm_program_header * header;
204 const char *code;
205 void *p;
206 boolean use_kill;
207
208 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
209 header = cso->prog;
210 code = cso->prog + sizeof(struct pipe_llvm_program_header);
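/* Two compile paths follow: with LLVM older than 3.6 each kernel keeps its
 * own LLVM module here and is compiled lazily in evergreen_launch_grid();
 * with newer LLVM the program arrives as an ELF binary whose bytecode is
 * uploaded to a VRAM code buffer right away. */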
211 #if HAVE_LLVM < 0x0306
212 (void)use_kill;
213 (void)p;
214 shader->llvm_ctx = LLVMContextCreate();
215 shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
216 code, header->num_bytes);
217 shader->kernels = CALLOC(sizeof(struct r600_kernel),
218 shader->num_kernels);
219 {
220 unsigned i;
221 for (i = 0; i < shader->num_kernels; i++) {
222 struct r600_kernel *kernel = &shader->kernels[i];
223 kernel->llvm_module = radeon_llvm_get_kernel_module(
224 shader->llvm_ctx, i, code, header->num_bytes);
225 }
226 }
227 #else
228 memset(&shader->binary, 0, sizeof(shader->binary));
229 radeon_elf_read(code, header->num_bytes, &shader->binary);
230 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
231
232 shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
233 shader->bc.ndw * 4);
234 p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
235 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
236 ctx->b.ws->buffer_unmap(shader->code_bo->cs_buf);
237 #endif
238 #endif
239
240 shader->ctx = ctx;
241 shader->local_size = cso->req_local_mem;
242 shader->private_size = cso->req_private_mem;
243 shader->input_size = cso->req_input_mem;
244
245 return shader;
246 }
247
248 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
249 {
250 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
251
252 if (!shader)
253 return;
254
255 FREE(shader);
256 }
257
258 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
259 {
260 struct r600_context *ctx = (struct r600_context *)ctx_;
261
262 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
263
264 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
265 }
266
267 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
268 * kernel parameters, there are implicit parameters that need to be stored
269 * in the vertex buffer as well. Here is how these parameters are organized in
270 * the buffer:
271 *
272 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
273 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
274 * DWORDS 6-8: Number of work items within each work group in each dimension
275 * (x,y,z)
276 * DWORDS 9+ : Kernel parameters
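 *
 * Viewed from the host side this is equivalent to the following layout
 * (an illustrative sketch only; the driver fills the dwords directly and
 * no such struct exists in the code):
 *
 *   struct implicit_kernel_args {
 *       uint32_t num_work_groups[3];  // DWORDS 0-2
 *       uint32_t global_size[3];      // DWORDS 3-5
 *       uint32_t local_size[3];       // DWORDS 6-8
 *   };                                // explicit parameters start at DWORD 9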
277 */
278 void evergreen_compute_upload_input(
279 struct pipe_context *ctx_,
280 const uint *block_layout,
281 const uint *grid_layout,
282 const void *input)
283 {
284 struct r600_context *ctx = (struct r600_context *)ctx_;
285 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
286 unsigned i;
287 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
288 * parameters.
289 */
290 unsigned input_size = shader->input_size + 36;
291 uint32_t * num_work_groups_start;
292 uint32_t * global_size_start;
293 uint32_t * local_size_start;
294 uint32_t * kernel_parameters_start;
295 struct pipe_box box;
296 struct pipe_transfer *transfer = NULL;
297
298 if (shader->input_size == 0) {
299 return;
300 }
301
302 if (!shader->kernel_param) {
303 /* Add space for the grid dimensions */
304 shader->kernel_param = (struct r600_resource *)
305 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
306 PIPE_USAGE_IMMUTABLE, input_size);
307 }
308
309 u_box_1d(0, input_size, &box);
310 num_work_groups_start = ctx_->transfer_map(ctx_,
311 (struct pipe_resource*)shader->kernel_param,
312 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
313 &box, &transfer);
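/* Each implicit section (work group count, global size, local size) is
 * 3 dwords; the pointer arithmetic below advances in uint32_t units. */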
314 global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
315 local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
316 kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
317
318 /* Copy the grid layout (number of work groups in each dimension) */
319 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
320
321 /* Copy the global size */
322 for (i = 0; i < 3; i++) {
323 global_size_start[i] = grid_layout[i] * block_layout[i];
324 }
325
326 /* Copy the local dimensions */
327 memcpy(local_size_start, block_layout, 3 * sizeof(uint));
328
329 /* Copy the kernel inputs */
330 memcpy(kernel_parameters_start, input, shader->input_size);
331
332 for (i = 0; i < (input_size / 4); i++) {
333 COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
334 ((unsigned*)num_work_groups_start)[i]);
335 }
336
337 ctx_->transfer_unmap(ctx_, transfer);
338
339 /* ID=0 is reserved for the parameters */
340 evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
341 (struct pipe_resource*)shader->kernel_param);
342 }
343
344 static void evergreen_emit_direct_dispatch(
345 struct r600_context *rctx,
346 const uint *block_layout, const uint *grid_layout)
347 {
348 int i;
349 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
350 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
351 unsigned num_waves;
352 unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
353 unsigned wave_divisor = (16 * num_pipes);
354 int group_size = 1;
355 int grid_size = 1;
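/* req_local_mem (local_size) is given in bytes while SQ_LDS_ALLOC counts
 * dwords, so convert and add the dwords the compiler already reserved for
 * the kernel (nlds_dw). */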
356 unsigned lds_size = shader->local_size / 4 +
357 #if HAVE_LLVM < 0x0306
358 shader->active_kernel->bc.nlds_dw;
359 #else
360 shader->bc.nlds_dw;
361 #endif
362
363
364 /* Calculate group_size/grid_size */
365 for (i = 0; i < 3; i++) {
366 group_size *= block_layout[i];
367 }
368
369 for (i = 0; i < 3; i++) {
370 grid_size *= grid_layout[i];
371 }
372
373 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
374 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
375 wave_divisor - 1) / wave_divisor;
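/* For example (illustrative numbers only): a 16x16x1 block is 256 threads;
 * with 2 pipes the divisor is 32, giving ceil(256 / 32) = 8 wavefronts per
 * thread block. */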
376
377 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
378 "%u wavefronts per thread block, "
379 "allocating %u dwords lds.\n",
380 num_pipes, num_waves, lds_size);
381
382 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
383
384 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
385 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
386 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
387 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
388
389 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
390 group_size);
391
392 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
393 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
394 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
395 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
396
397 if (rctx->b.chip_class < CAYMAN) {
398 assert(lds_size <= 8192);
399 } else {
400 /* Cayman appears to have a slightly smaller limit, see the
401 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
402 assert(lds_size <= 8160);
403 }
404
405 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
406 lds_size | (num_waves << 14));
407
408 /* Dispatch packet */
409 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
410 radeon_emit(cs, grid_layout[0]);
411 radeon_emit(cs, grid_layout[1]);
412 radeon_emit(cs, grid_layout[2]);
413 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
414 radeon_emit(cs, 1);
415 }
416
417 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
418 const uint *grid_layout)
419 {
420 struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
421 unsigned i;
422
423 /* Make sure the gfx ring is the only active ring: flush any pending DMA work first. */
424 if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
425 ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
426 }
427
428 /* Initialize all the compute-related registers.
429 *
430 * See evergreen_init_atom_start_compute_cs() in this file for the list
431 * of registers initialized by the start_compute_cs_cmd atom.
432 */
433 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
434
435 ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
436 r600_flush_emit(ctx);
437
438 /* Emit colorbuffers. */
439 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
440 for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
441 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
442 unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
443 (struct r600_resource*)cb->base.texture,
444 RADEON_USAGE_READWRITE,
445 RADEON_PRIO_SHADER_RW_BUFFER);
446
447 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
448 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
449 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
450 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
451 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
452 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
453 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
454 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
455
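/* Relocations on r600 are emitted as a NOP packet whose payload is the
 * buffer-list index returned by radeon_add_to_buffer_list() above. */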
456 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
457 radeon_emit(cs, reloc);
458
459 if (!ctx->keep_tiling_flags) {
460 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
461 radeon_emit(cs, reloc);
462 }
463
464 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
465 radeon_emit(cs, reloc);
466 }
467 if (ctx->keep_tiling_flags) {
468 for (; i < 8 ; i++) {
469 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
470 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
471 }
472 for (; i < 12; i++) {
473 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
474 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
475 }
476 }
477
478 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
479 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
480 ctx->compute_cb_target_mask);
481
482
483 /* Emit vertex buffer state */
484 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
485 r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
486
487 /* Emit constant buffer state */
488 r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
489
490 /* Emit sampler state */
491 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
492
493 /* Emit sampler view (texture resource) state */
494 r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
495
496 /* Emit compute shader state */
497 r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
498
499 /* Emit dispatch state and dispatch packet */
500 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
501
502 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
503 */
504 ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
505 R600_CONTEXT_INV_VERTEX_CACHE |
506 R600_CONTEXT_INV_TEX_CACHE;
507 r600_flush_emit(ctx);
508 ctx->b.flags = 0;
509
510 if (ctx->b.chip_class >= CAYMAN) {
511 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
512 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
513 /* DEALLOC_STATE prevents the GPU from hanging when a
514 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
515 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
516 */
517 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
518 cs->buf[cs->cdw++] = 0;
519 }
520
521 #if 0
522 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
523 for (i = 0; i < cs->cdw; i++) {
524 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
525 }
526 #endif
527
528 }
529
530
531 /**
532 * Emit function for r600_cs_shader_state atom
533 */
534 void evergreen_emit_cs_shader(
535 struct r600_context *rctx,
536 struct r600_atom *atom)
537 {
538 struct r600_cs_shader_state *state =
539 (struct r600_cs_shader_state*)atom;
540 struct r600_pipe_compute *shader = state->shader;
541 struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
542 uint64_t va;
543 struct r600_resource *code_bo;
544 unsigned ngpr, nstack;
545
546 #if HAVE_LLVM < 0x0306
547 struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
548 code_bo = kernel->code_bo;
549 va = kernel->code_bo->gpu_address;
550 ngpr = kernel->bc.ngpr;
551 nstack = kernel->bc.nstack;
552 #else
553 code_bo = shader->code_bo;
554 va = shader->code_bo->gpu_address + state->pc;
555 ngpr = shader->bc.ngpr;
556 nstack = shader->bc.nstack;
557 #endif
558
559 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
560 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
561 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
562 S_0288D4_NUM_GPRS(ngpr)
563 | S_0288D4_STACK_SIZE(nstack));
564 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
565
566 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
567 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
568 code_bo, RADEON_USAGE_READ,
569 RADEON_PRIO_USER_SHADER));
570 }
571
572 static void evergreen_launch_grid(
573 struct pipe_context *ctx_,
574 const uint *block_layout, const uint *grid_layout,
575 uint32_t pc, const void *input)
576 {
577 struct r600_context *ctx = (struct r600_context *)ctx_;
578 #ifdef HAVE_OPENCL
579 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
580 boolean use_kill;
581
582 #if HAVE_LLVM < 0x0306
583 struct r600_kernel *kernel = &shader->kernels[pc];
584 (void)use_kill;
585 if (!kernel->code_bo) {
586 void *p;
587 struct r600_bytecode *bc = &kernel->bc;
588 LLVMModuleRef mod = kernel->llvm_module;
589 boolean use_kill = false;
590 bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
591 unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
592 unsigned sb_disasm = use_sb ||
593 (ctx->screen->b.debug_flags & DBG_SB_DISASM);
594
595 r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
596 ctx->screen->has_compressed_msaa_texturing);
597 bc->type = TGSI_PROCESSOR_COMPUTE;
598 bc->isa = ctx->isa;
599 r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
600
601 if (dump && !sb_disasm) {
602 r600_bytecode_disasm(bc);
603 } else if ((dump && sb_disasm) || use_sb) {
604 if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
605 R600_ERR("r600_sb_bytecode_process failed!\n");
606 }
607
608 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
609 kernel->bc.ndw * 4);
610 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
611 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
612 ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
613 }
614 shader->active_kernel = kernel;
615 ctx->cs_shader_state.kernel_index = pc;
616 #else
617 ctx->cs_shader_state.pc = pc;
618 /* Get the config information for this kernel. */
619 r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
620 #endif
621 #endif
622
623 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
624
625
626 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
627 compute_emit_cs(ctx, block_layout, grid_layout);
628 }
629
630 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
631 unsigned start, unsigned count,
632 struct pipe_surface ** surfaces)
633 {
634 struct r600_context *ctx = (struct r600_context *)ctx_;
635 struct r600_surface **resources = (struct r600_surface **)surfaces;
636
637 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
638 start, count);
639
640 for (unsigned i = 0; i < count; i++) {
641 /* The first two vertex buffers are reserved for parameters and
642 * global buffers. */
643 unsigned vtx_id = 2 + i;
644 if (resources[i]) {
645 struct r600_resource_global *buffer =
646 (struct r600_resource_global*)
647 resources[i]->base.texture;
648 if (resources[i]->base.writable) {
649 assert(i+1 < 12);
650
651 evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
652 (struct r600_resource *)resources[i]->base.texture,
653 buffer->chunk->start_in_dw*4,
654 resources[i]->base.texture->width0);
655 }
656
657 evergreen_cs_set_vertex_buffer(ctx, vtx_id,
658 buffer->chunk->start_in_dw * 4,
659 resources[i]->base.texture);
660 }
661 }
662 }
663
664 static void evergreen_set_global_binding(
665 struct pipe_context *ctx_, unsigned first, unsigned n,
666 struct pipe_resource **resources,
667 uint32_t **handles)
668 {
669 struct r600_context *ctx = (struct r600_context *)ctx_;
670 struct compute_memory_pool *pool = ctx->screen->global_pool;
671 struct r600_resource_global **buffers =
672 (struct r600_resource_global **)resources;
673 unsigned i;
674
675 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
676 first, n);
677
678 if (!resources) {
679 /* XXX: Unset */
680 return;
681 }
682
683 /* We mark these items for promotion to the pool if they
684 * aren't already there */
685 for (i = first; i < first + n; i++) {
686 struct compute_memory_item *item = buffers[i]->chunk;
687
688 if (!is_item_in_pool(item))
689 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
690 }
691
692 if (compute_memory_finalize_pending(pool, ctx_) == -1) {
693 /* XXX: Unset */
694 return;
695 }
696
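/* Every global buffer lives in the shared memory pool, so rewrite each
 * handle from an offset within its own buffer into a byte offset within
 * the pool (handles are stored little-endian). */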
697 for (i = first; i < first + n; i++)
698 {
699 uint32_t buffer_offset;
700 uint32_t handle;
701 assert(resources[i]->target == PIPE_BUFFER);
702 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
703
704 buffer_offset = util_le32_to_cpu(*(handles[i]));
705 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
706
707 *(handles[i]) = util_cpu_to_le32(handle);
708 }
709
710 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
711 evergreen_cs_set_vertex_buffer(ctx, 1, 0,
712 (struct pipe_resource*)pool->bo);
713 }
714
715 /**
716 * This function initializes all the compute specific registers that need to
717 * be initialized for each compute command stream. Registers that are common
718 * to both compute and 3D will be initialized at the beginning of each compute
719 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
720 * packet requires that the shader type bit be set, we must initialize all
721 * context registers needed for compute in this function. The registers
722 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
723 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
724 * on the GPU family.
725 */
726 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
727 {
728 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
729 int num_threads;
730 int num_stack_entries;
731
732 /* since all required registers are initialised in the
733 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
734 */
735 r600_init_command_buffer(cb, 256);
736 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
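/* Setting this flag makes the packets stored in this command buffer carry
 * the compute shader type bit that the note above requires. */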
737
738 /* This must be first. */
739 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
740 r600_store_value(cb, 0x80000000);
741 r600_store_value(cb, 0x80000000);
742
743 /* We're setting config registers here. */
744 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
745 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
746
747 switch (ctx->b.family) {
748 case CHIP_CEDAR:
749 default:
750 num_threads = 128;
751 num_stack_entries = 256;
752 break;
753 case CHIP_REDWOOD:
754 num_threads = 128;
755 num_stack_entries = 256;
756 break;
757 case CHIP_JUNIPER:
758 num_threads = 128;
759 num_stack_entries = 512;
760 break;
761 case CHIP_CYPRESS:
762 case CHIP_HEMLOCK:
763 num_threads = 128;
764 num_stack_entries = 512;
765 break;
766 case CHIP_PALM:
767 num_threads = 128;
768 num_stack_entries = 256;
769 break;
770 case CHIP_SUMO:
771 num_threads = 128;
772 num_stack_entries = 256;
773 break;
774 case CHIP_SUMO2:
775 num_threads = 128;
776 num_stack_entries = 512;
777 break;
778 case CHIP_BARTS:
779 num_threads = 128;
780 num_stack_entries = 512;
781 break;
782 case CHIP_TURKS:
783 num_threads = 128;
784 num_stack_entries = 256;
785 break;
786 case CHIP_CAICOS:
787 num_threads = 128;
788 num_stack_entries = 256;
789 break;
790 }
791
792 /* Config Registers */
793 if (ctx->b.chip_class < CAYMAN)
794 evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
795 ctx->screen->b.info.drm_minor);
796 else
797 cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
798 ctx->screen->b.info.drm_minor);
799
800 /* The primitive type always needs to be POINTLIST for compute. */
801 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
802 V_008958_DI_PT_POINTLIST);
803
804 if (ctx->b.chip_class < CAYMAN) {
805
806 /* These registers control which simds can be used by each stage.
807 * The default for these registers is 0xffffffff, which means
808 * all simds are available for each stage. It's possible we may
809 * want to play around with these in the future, but for now
810 * the default value is fine.
811 *
812 * R_008E20_SQ_STATIC_THREAD_MGMT1
813 * R_008E24_SQ_STATIC_THREAD_MGMT2
814 * R_008E28_SQ_STATIC_THREAD_MGMT3
815 */
816
817 /* XXX: We may need to adjust the thread and stack resource
818 * values for 3D/compute interop */
819
820 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
821
822 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
823 * Set the number of threads used by the PS/VS/GS/ES stage to
824 * 0.
825 */
826 r600_store_value(cb, 0);
827
828 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
829 * Set the number of threads used by the CS (aka LS) stage to
830 * the maximum number of threads and set the number of threads
831 * for the HS stage to 0. */
832 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
833
834 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
835 * Set the Control Flow stack entries to 0 for PS/VS stages */
836 r600_store_value(cb, 0);
837
838 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
839 * Set the Control Flow stack entries to 0 for GS/ES stages */
840 r600_store_value(cb, 0);
841
842 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
843 * Set the Control Flow stack entries to 0 for the HS stage, and
844 * set it to the maximum value for the CS (aka LS) stage. */
845 r600_store_value(cb,
846 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
847 }
848 /* Give the compute shader all the available LDS space.
849 * NOTE: This only sets the maximum number of dwords that a compute
850 * shader can allocate. When a shader is executed, we still need to
851 * allocate the appropriate amount of LDS dwords using the
852 * CM_R_0288E8_SQ_LDS_ALLOC register.
853 */
854 if (ctx->b.chip_class < CAYMAN) {
855 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
856 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
857 } else {
858 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
859 S_0286FC_NUM_PS_LDS(0) |
860 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
861 }
862
863 /* Context Registers */
864
865 if (ctx->b.chip_class < CAYMAN) {
866 /* workaround for hw issues with dyn gpr - must set all limits
867 * to 240 instead of 0, 0x1e == 240 / 8
868 */
869 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
870 S_028838_PS_GPRS(0x1e) |
871 S_028838_VS_GPRS(0x1e) |
872 S_028838_GS_GPRS(0x1e) |
873 S_028838_ES_GPRS(0x1e) |
874 S_028838_HS_GPRS(0x1e) |
875 S_028838_LS_GPRS(0x1e));
876 }
877
878 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
879 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
880 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
881
882 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
883
884 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
885 S_0286E8_TID_IN_GROUP_ENA
886 | S_0286E8_TGID_ENA
887 | S_0286E8_DISABLE_INDEX_PACK);
888 
889
890 /* The LOOP_CONST registers are an optimization for loops that allows
891 * you to store the initial counter, increment value, and maximum
892 * counter value in a register so that the hardware can calculate the
893 * correct number of iterations for the loop and you don't need
894 * to keep the loop counter in your shader code. We don't currently use
895 * this optimization, so we must keep track of the counter in the
896 * shader and use a break instruction to exit loops. However, the
897 * hardware still uses this register to determine when to exit a
898 * loop, so we need to initialize the counter to 0, set the increment
899 * value to 1 and the maximum counter value to 4095 (0xfff), which
900 * is the maximum value allowed. This gives us a maximum of 4096
901 * iterations for our loops, but hopefully our break instruction will
902 * execute some time before the 4096th iteration.
903 */
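/* 0x1000FFF appears to pack increment = 1, initial value = 0 and a trip
 * count of 0xFFF, matching the description above (an assumption based on
 * the SQ_LOOP_CONST field layout, not verified here). */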
904 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
905 }
906
907 void evergreen_init_compute_state_functions(struct r600_context *ctx)
908 {
909 ctx->b.b.create_compute_state = evergreen_create_compute_state;
910 ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
911 ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
912 // ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
913 ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
914 ctx->b.b.set_global_binding = evergreen_set_global_binding;
915 ctx->b.b.launch_grid = evergreen_launch_grid;
916
917 }
918
919 struct pipe_resource *r600_compute_global_buffer_create(
920 struct pipe_screen *screen,
921 const struct pipe_resource *templ)
922 {
923 struct r600_resource_global* result = NULL;
924 struct r600_screen* rscreen = NULL;
925 int size_in_dw = 0;
926
927 assert(templ->target == PIPE_BUFFER);
928 assert(templ->bind & PIPE_BIND_GLOBAL);
929 assert(templ->array_size == 1 || templ->array_size == 0);
930 assert(templ->depth0 == 1 || templ->depth0 == 0);
931 assert(templ->height0 == 1 || templ->height0 == 0);
932
933 result = (struct r600_resource_global*)
934 CALLOC(sizeof(struct r600_resource_global), 1);
935 rscreen = (struct r600_screen*)screen;
936
937 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
938 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
939 templ->array_size);
940
941 result->base.b.vtbl = &r600_global_buffer_vtbl;
942 result->base.b.b.screen = screen;
943 result->base.b.b = *templ;
944 pipe_reference_init(&result->base.b.b.reference, 1);
945
946 size_in_dw = (templ->width0+3) / 4;
947
948 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
949
950 if (result->chunk == NULL)
951 {
952 free(result);
953 return NULL;
954 }
955
956 return &result->base.b.b;
957 }
958
959 void r600_compute_global_buffer_destroy(
960 struct pipe_screen *screen,
961 struct pipe_resource *res)
962 {
963 struct r600_resource_global* buffer = NULL;
964 struct r600_screen* rscreen = NULL;
965
966 assert(res->target == PIPE_BUFFER);
967 assert(res->bind & PIPE_BIND_GLOBAL);
968
969 buffer = (struct r600_resource_global*)res;
970 rscreen = (struct r600_screen*)screen;
971
972 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
973
974 buffer->chunk = NULL;
975 free(res);
976 }
977
978 void *r600_compute_global_transfer_map(
979 struct pipe_context *ctx_,
980 struct pipe_resource *resource,
981 unsigned level,
982 unsigned usage,
983 const struct pipe_box *box,
984 struct pipe_transfer **ptransfer)
985 {
986 struct r600_context *rctx = (struct r600_context*)ctx_;
987 struct compute_memory_pool *pool = rctx->screen->global_pool;
988 struct r600_resource_global* buffer =
989 (struct r600_resource_global*)resource;
990
991 struct compute_memory_item *item = buffer->chunk;
992 struct pipe_resource *dst = NULL;
993 unsigned offset = box->x;
994
995 if (is_item_in_pool(item)) {
996 compute_memory_demote_item(pool, item, ctx_);
997 }
998 else {
999 if (item->real_buffer == NULL) {
1000 item->real_buffer =
1001 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1002 }
1003 }
1004
1005 dst = (struct pipe_resource*)item->real_buffer;
1006
1007 if (usage & PIPE_TRANSFER_READ)
1008 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1009
1010 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1011 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1012 "width = %u, height = %u, depth = %u)\n", level, usage,
1013 box->x, box->y, box->z, box->width, box->height,
1014 box->depth);
1015 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1016 "%u (box.x)\n", item->id, box->x);
1017
1018
1019 assert(resource->target == PIPE_BUFFER);
1020 assert(resource->bind & PIPE_BIND_GLOBAL);
1021 assert(box->x >= 0);
1022 assert(box->y == 0);
1023 assert(box->z == 0);
1024
1025 ///TODO: do it better, mapping is not possible if the pool is too big
1026 return pipe_buffer_map_range(ctx_, dst,
1027 offset, box->width, usage, ptransfer);
1028 }
1029
1030 void r600_compute_global_transfer_unmap(
1031 struct pipe_context *ctx_,
1032 struct pipe_transfer* transfer)
1033 {
1034 /* A struct r600_resource_global is not a real resource; it just maps
1035 * to an offset within the compute memory pool. The function
1036 * r600_compute_global_transfer_map() maps the memory pool
1037 * resource rather than the struct r600_resource_global passed to
1038 * it as an argument and then initializes ptransfer->resource with
1039 * the memory pool resource (via pipe_buffer_map_range).
1040 * When transfer_unmap is called it uses the memory pool's
1041 * vtable which calls r600_buffer_transfer_unmap() rather than
1042 * this function.
1043 */
1044 assert (!"This function should not be called");
1045 }
1046
1047 void r600_compute_global_transfer_flush_region(
1048 struct pipe_context *ctx_,
1049 struct pipe_transfer *transfer,
1050 const struct pipe_box *box)
1051 {
1052 assert(0 && "TODO");
1053 }
1054
1055 void r600_compute_global_transfer_inline_write(
1056 struct pipe_context *pipe,
1057 struct pipe_resource *resource,
1058 unsigned level,
1059 unsigned usage,
1060 const struct pipe_box *box,
1061 const void *data,
1062 unsigned stride,
1063 unsigned layer_stride)
1064 {
1065 assert(0 && "TODO");
1066 }