src/gallium/drivers/nouveau/nvc0/nve4_compute.c

   1 /*
   2  * Copyright 2012 Nouveau Project
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  * Authors: Christoph Bumiller
  23  */
  24
  25 #include "nvc0/nvc0_context.h"
  26 #include "nvc0/nve4_compute.h"
  27
  28 #include "codegen/nv50_ir_driver.h"
  29
  30 #ifdef DEBUG
  31 static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
  32 #endif
  33
  34
  35 int
  36 nve4_screen_compute_setup(struct nvc0_screen *screen,
  37                           struct nouveau_pushbuf *push)
  38 {
  39    struct nouveau_device *dev = screen->base.device;
  40    struct nouveau_object *chan = screen->base.channel;
  41    int i;
  42    int ret;
  43    uint32_t obj_class;
  44    uint64_t address;
  45
  46    switch (dev->chipset & ~0xf) {
  47    case 0x100:
  48    case 0xf0:
  49       obj_class = NVF0_COMPUTE_CLASS; /* GK110 */
  50       break;
  51    case 0xe0:
  52       obj_class = NVE4_COMPUTE_CLASS; /* GK104 */
  53       break;
  54    case 0x110:
  55       obj_class = GM107_COMPUTE_CLASS;
  56       break;
  57    default:
  58       NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
  59       return -1;
  60    }
  61
  62    ret = nouveau_object_new(chan, 0xbeef00c0, obj_class, NULL, 0,
  63                             &screen->compute);
  64    if (ret) {
  65       NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret);
  66       return ret;
  67    }
  68
  69    ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
  70                         &screen->parm);
  71    if (ret)
  72       return ret;
  73
  74    BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
  75    PUSH_DATA (push, screen->compute->oclass);
  76
  77    BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2);
  78    PUSH_DATAh(push, screen->tls->offset);
  79    PUSH_DATA (push, screen->tls->offset);
  80    /* No idea why there are 2. Divide size by 2 to be safe.
  81     * Actually this might be per-MP TEMP size and looks like I'm only using
  82     * 2 MPs instead of all 8.
  83     */
  84    BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3);
  85    PUSH_DATAh(push, screen->tls->size / screen->mp_count);
  86    PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
  87    PUSH_DATA (push, 0xff);
  88    BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
  89    PUSH_DATAh(push, screen->tls->size / screen->mp_count);
  90    PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
  91    PUSH_DATA (push, 0xff);
  92
  93    /* Unified address space ? Who needs that ? Certainly not OpenCL.
  94     *
  95     * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
  96     *  accessible. We cannot prevent that at the moment, so expect failure.
  97     */
  98    BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
  99    PUSH_DATA (push, 1 << 24);
 100    BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
 101    PUSH_DATA (push, 2 << 24);
 102
 103    BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
 104    PUSH_DATAh(push, screen->text->offset);
 105    PUSH_DATA (push, screen->text->offset);
 106
 107    BEGIN_NVC0(push, SUBC_CP(0x0310), 1);
 108    PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);
 109
 110    /* NOTE: these do not affect the state used by the 3D object */
 111    BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3);
 112    PUSH_DATAh(push, screen->txc->offset);
 113    PUSH_DATA (push, screen->txc->offset);
 114    PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
 115    BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3);
 116    PUSH_DATAh(push, screen->txc->offset + 65536);
 117    PUSH_DATA (push, screen->txc->offset + 65536);
 118    PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);
 119
 120    if (obj_class >= NVF0_COMPUTE_CLASS) {
 121       /* The blob calls GK110_COMPUTE.FIRMWARE[0x6], along with the args (0x1)
 122        * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently
 123        * disabled because our firmware doesn't support these commands and the
 124        * GPU hangs if they are used. */
 125       BEGIN_NIC0(push, SUBC_CP(0x0248), 64);
 126       for (i = 63; i >= 0; i--)
 127          PUSH_DATA(push, 0x38000 | i);
 128       IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
 129    }
 130
 131    BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
 132    PUSH_DATA (push, 7); /* does not interfere with 3D */
 133
 134    if (obj_class == NVF0_COMPUTE_CLASS)
 135       IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
 136
 137    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 138
 139    /* MS sample coordinate offsets: these do not work with _ALT modes ! */
 140    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 141    PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
 142    PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
 143    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 144    PUSH_DATA (push, 64);
 145    PUSH_DATA (push, 1);
 146    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
 147    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 148    PUSH_DATA (push, 0); /* 0 */
 149    PUSH_DATA (push, 0);
 150    PUSH_DATA (push, 1); /* 1 */
 151    PUSH_DATA (push, 0);
 152    PUSH_DATA (push, 0); /* 2 */
 153    PUSH_DATA (push, 1);
 154    PUSH_DATA (push, 1); /* 3 */
 155    PUSH_DATA (push, 1);
 156    PUSH_DATA (push, 2); /* 4 */
 157    PUSH_DATA (push, 0);
 158    PUSH_DATA (push, 3); /* 5 */
 159    PUSH_DATA (push, 0);
 160    PUSH_DATA (push, 2); /* 6 */
 161    PUSH_DATA (push, 1);
 162    PUSH_DATA (push, 3); /* 7 */
 163    PUSH_DATA (push, 1);
 164
 165 #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
 166    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 167    PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
 168    PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
 169    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 170    PUSH_DATA (push, 28);
 171    PUSH_DATA (push, 1);
 172    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8);
 173    PUSH_DATA (push, 1);
 174    PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
 175    PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
 176    PUSH_DATA (push, screen->tls->offset);
 177    PUSH_DATAh(push, screen->tls->offset);
 178    PUSH_DATA (push, screen->tls->size / 2); /* MP TEMP block size */
 179    PUSH_DATA (push, screen->tls->size / 2 / 64); /* warp TEMP block size */
 180    PUSH_DATA (push, 0); /* warp cfstack size */
 181 #endif
 182
 183    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 184    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 185
 186    return 0;
 187 }
 188
 189
 190 static void
 191 nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
 192 {
 193    struct nvc0_screen *screen = nvc0->screen;
 194    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 195    struct nv50_surface *sf;
 196    struct nv04_resource *res;
 197    uint32_t mask;
 198    unsigned i;
 199    const unsigned t = 1;
 200    uint64_t address;
 201
 202    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 203
 204    mask = nvc0->surfaces_dirty[t];
 205    while (mask) {
 206       i = ffs(mask) - 1;
 207       mask &= ~(1 << i);
 208
 209       /*
 210        * NVE4's surface load/store instructions receive all the information
 211        * directly instead of via binding points, so we have to supply them.
 212        */
 213       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 214       PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(i));
 215       PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(i));
 216       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 217       PUSH_DATA (push, 64);
 218       PUSH_DATA (push, 1);
 219       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
 220       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 221
 222       nve4_set_surface_info(push, nvc0->surfaces[t][i], screen);
 223
 224       sf = nv50_surface(nvc0->surfaces[t][i]);
 225       if (sf) {
 226          res = nv04_resource(sf->base.texture);
 227
 228          if (sf->base.writable)
 229             BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
 230          else
 231             BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
 232       }
 233    }
 234    if (nvc0->surfaces_dirty[t]) {
 235       BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 236       PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 237    }
 238
 239    /* re-reference non-dirty surfaces */
 240    mask = nvc0->surfaces_valid[t] & ~nvc0->surfaces_dirty[t];
 241    while (mask) {
 242       i = ffs(mask) - 1;
 243       mask &= ~(1 << i);
 244
 245       sf = nv50_surface(nvc0->surfaces[t][i]);
 246       res = nv04_resource(sf->base.texture);
 247
 248       if (sf->base.writable)
 249          BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
 250       else
 251          BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
 252    }
 253
 254    nvc0->surfaces_dirty[t] = 0;
 255 }
 256
 257
 258 /* Thankfully, textures with samplers follow the normal rules. */
 259 static void
 260 nve4_compute_validate_samplers(struct nvc0_context *nvc0)
 261 {
 262    bool need_flush = nve4_validate_tsc(nvc0, 5);
 263    if (need_flush) {
 264       BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1);
 265       PUSH_DATA (nvc0->base.pushbuf, 0);
 266    }
 267 }
 268 /* (Code duplicated at bottom for various non-convincing reasons.
 269  *  E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
 270  *  entries to avoid a subchannel switch.
 271  *  Same for texture cache flushes.
 272  *  Also, the bufctx differs, and more IFs in the 3D version looks ugly.)
 273  */
 274 static void nve4_compute_validate_textures(struct nvc0_context *);
 275
 276 static void
 277 nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
 278 {
 279    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 280    struct nvc0_screen *screen = nvc0->screen;
 281    uint64_t address;
 282    const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
 283    unsigned i, n;
 284    uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];
 285
 286    if (!dirty)
 287       return;
 288    i = ffs(dirty) - 1;
 289    n = util_logbase2(dirty) + 1 - i;
 290    assert(n);
 291
 292    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 293
 294    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 295    PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
 296    PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
 297    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 298    PUSH_DATA (push, n * 4);
 299    PUSH_DATA (push, 0x1);
 300    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n);
 301    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 302    PUSH_DATAp(push, &nvc0->tex_handles[s][i], n);
 303
 304    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 305    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 306
 307    nvc0->textures_dirty[s] = 0;
 308    nvc0->samplers_dirty[s] = 0;
 309 }
 310
 311 static struct nvc0_state_validate
 312 validate_list_cp[] = {
 313    { nvc0_compprog_validate,              NVC0_NEW_CP_PROGRAM     },
 314    { nve4_compute_validate_textures,      NVC0_NEW_CP_TEXTURES    },
 315    { nve4_compute_validate_samplers,      NVC0_NEW_CP_SAMPLERS    },
 316    { nve4_compute_set_tex_handles,        NVC0_NEW_CP_TEXTURES |
 317                                           NVC0_NEW_CP_SAMPLERS    },
 318    { nve4_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
 319    { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
 320 };
 321
 322 static bool
 323 nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
 324 {
 325    bool ret;
 326
 327    ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
 328                              ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
 329                              nvc0->bufctx_cp);
 330
 331    if (unlikely(nvc0->state.flushed))
 332       nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
 333    return ret;
 334 }
 335
 336 static void
 337 nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
 338                           const uint *block_layout,
 339                           const uint *grid_layout)
 340 {
 341    struct nvc0_screen *screen = nvc0->screen;
 342    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 343    struct nvc0_program *cp = nvc0->compprog;
 344    uint64_t address;
 345
 346    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 347
 348    if (cp->parm_size) {
 349       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 350       PUSH_DATAh(push, screen->parm->offset);
 351       PUSH_DATA (push, screen->parm->offset);
 352       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 353       PUSH_DATA (push, cp->parm_size);
 354       PUSH_DATA (push, 0x1);
 355       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
 356       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 357       PUSH_DATAp(push, input, cp->parm_size / 4);
 358    }
 359    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 360    PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO);
 361    PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO);
 362    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 363    PUSH_DATA (push, 7 * 4);
 364    PUSH_DATA (push, 0x1);
 365    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
 366    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 367    PUSH_DATAp(push, block_layout, 3);
 368    PUSH_DATAp(push, grid_layout, 3);
 369    PUSH_DATA (push, 0);
 370
 371    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 372    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 373 }
 374
 375 static inline uint8_t
 376 nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 377 {
 378    if (shared_size > (32 << 10))
 379       return NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1;
 380    if (shared_size > (16 << 10))
 381       return NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1;
 382    return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
 383 }
 384
 385 static void
 386 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
 387                                struct nve4_cp_launch_desc *desc,
 388                                uint32_t label,
 389                                const uint *block_layout,
 390                                const uint *grid_layout)
 391 {
 392    const struct nvc0_screen *screen = nvc0->screen;
 393    const struct nvc0_program *cp = nvc0->compprog;
 394    unsigned i;
 395
 396    nve4_cp_launch_desc_init_default(desc);
 397
 398    desc->entry = nvc0_program_symbol_offset(cp, label);
 399
 400    desc->griddim_x = grid_layout[0];
 401    desc->griddim_y = grid_layout[1];
 402    desc->griddim_z = grid_layout[2];
 403    desc->blockdim_x = block_layout[0];
 404    desc->blockdim_y = block_layout[1];
 405    desc->blockdim_z = block_layout[2];
 406
 407    desc->shared_size = align(cp->cp.smem_size, 0x100);
 408    desc->local_size_p = align(cp->cp.lmem_size, 0x10);
 409    desc->local_size_n = 0;
 410    desc->cstack_size = 0x800;
 411    desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size);
 412
 413    desc->gpr_alloc = cp->num_gprs;
 414    desc->bar_alloc = cp->num_barriers;
 415
 416    for (i = 0; i < 7; ++i) {
 417       const unsigned s = 5;
 418       if (nvc0->constbuf[s][i].u.buf)
 419          nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
 420    }
 421    nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12);
 422    nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
 423                               NVC0_CB_AUX_INFO(5), 1 << 10);
 424 }
 425
 426 static inline struct nve4_cp_launch_desc *
 427 nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
 428                                struct nouveau_bo **pbo, uint64_t *pgpuaddr)
 429 {
 430    uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
 431    if (!ptr)
 432       return NULL;
 433    if (*pgpuaddr & 255) {
 434       unsigned adj = 256 - (*pgpuaddr & 255);
 435       ptr += adj;
 436       *pgpuaddr += adj;
 437    }
 438    return (struct nve4_cp_launch_desc *)ptr;
 439 }
 440
 441 void
 442 nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 443 {
 444    struct nvc0_context *nvc0 = nvc0_context(pipe);
 445    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 446    struct nve4_cp_launch_desc *desc;
 447    uint64_t desc_gpuaddr;
 448    struct nouveau_bo *desc_bo;
 449    int ret;
 450
 451    desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
 452    if (!desc) {
 453       ret = -1;
 454       goto out;
 455    }
 456    BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
 457                 desc_bo);
 458
 459    ret = !nve4_state_validate_cp(nvc0, ~0);
 460    if (ret)
 461       goto out;
 462
 463    nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
 464                                   info->block, info->grid);
 465 #ifdef DEBUG
 466    if (debug_get_num_option("NV50_PROG_DEBUG", 0))
 467       nve4_compute_dump_launch_desc(desc);
 468 #endif
 469
 470    nve4_compute_upload_input(nvc0, info->input, info->block, info->grid);
 471
 472    /* upload descriptor and flush */
 473 #if 0
 474    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 475    PUSH_DATAh(push, desc_gpuaddr);
 476    PUSH_DATA (push, desc_gpuaddr);
 477    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 478    PUSH_DATA (push, 256);
 479    PUSH_DATA (push, 1);
 480    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
 481    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
 482    PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
 483    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 484    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
 485 #endif
 486    BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
 487    PUSH_DATA (push, desc_gpuaddr >> 8);
 488    BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
 489    PUSH_DATA (push, 0x3);
 490    BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
 491    PUSH_DATA (push, 0);
 492
 493 out:
 494    if (ret)
 495       NOUVEAU_ERR("Failed to launch grid !\n");
 496    nouveau_scratch_done(&nvc0->base);
 497    nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC);
 498 }
 499
 500
 501 #define NVE4_TIC_ENTRY_INVALID 0x000fffff
 502
 503 static void
 504 nve4_compute_validate_textures(struct nvc0_context *nvc0)
 505 {
 506    struct nouveau_bo *txc = nvc0->screen->txc;
 507    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 508    const unsigned s = 5;
 509    unsigned i;
 510    uint32_t commands[2][32];
 511    unsigned n[2] = { 0, 0 };
 512
 513    for (i = 0; i < nvc0->num_textures[s]; ++i) {
 514       struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
 515       struct nv04_resource *res;
 516       const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 517
 518       if (!tic) {
 519          nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
 520          continue;
 521       }
 522       res = nv04_resource(tic->pipe.texture);
 523
 524       if (tic->id < 0) {
 525          tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
 526
 527          PUSH_SPACE(push, 16);
 528          BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 529          PUSH_DATAh(push, txc->offset + (tic->id * 32));
 530          PUSH_DATA (push, txc->offset + (tic->id * 32));
 531          BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 532          PUSH_DATA (push, 32);
 533          PUSH_DATA (push, 1);
 534          BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
 535          PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 536          PUSH_DATAp(push, &tic->tic[0], 8);
 537
 538          commands[0][n[0]++] = (tic->id << 4) | 1;
 539       } else
 540       if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
 541          commands[1][n[1]++] = (tic->id << 4) | 1;
 542       }
 543       nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
 544
 545       res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
 546       res->status |=  NOUVEAU_BUFFER_STATUS_GPU_READING;
 547
 548       nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
 549       nvc0->tex_handles[s][i] |= tic->id;
 550       if (dirty)
 551          BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
 552    }
 553    for (; i < nvc0->state.num_textures[s]; ++i)
 554       nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
 555
 556    if (n[0]) {
 557       BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]);
 558       PUSH_DATAp(push, commands[0], n[0]);
 559    }
 560    if (n[1]) {
 561       BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]);
 562       PUSH_DATAp(push, commands[1], n[1]);
 563    }
 564
 565    nvc0->state.num_textures[s] = nvc0->num_textures[s];
 566 }
 567
 568
 569 #ifdef DEBUG
 570 static const char *nve4_cache_split_name(unsigned value)
 571 {
 572    switch (value) {
 573    case NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1: return "16K_SHARED_48K_L1";
 574    case NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1: return "32K_SHARED_32K_L1";
 575    case NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1: return "48K_SHARED_16K_L1";
 576    default:
 577       return "(invalid)";
 578    }
 579 }
 580
 581 static void
 582 nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
 583 {
 584    const uint32_t *data = (const uint32_t *)desc;
 585    unsigned i;
 586    bool zero = false;
 587
 588    debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
 589
 590    for (i = 0; i < sizeof(*desc); i += 4) {
 591       if (data[i / 4]) {
 592          debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
 593          zero = false;
 594       } else
 595       if (!zero) {
 596          debug_printf("...\n");
 597          zero = true;
 598       }
 599    }
 600
 601    debug_printf("entry = 0x%x\n", desc->entry);
 602    debug_printf("grid dimensions = %ux%ux%u\n",
 603                 desc->griddim_x, desc->griddim_y, desc->griddim_z);
 604    debug_printf("block dimensions = %ux%ux%u\n",
 605                 desc->blockdim_x, desc->blockdim_y, desc->blockdim_z);
 606    debug_printf("s[] size: 0x%x\n", desc->shared_size);
 607    debug_printf("l[] size: -0x%x / +0x%x\n",
 608                 desc->local_size_n, desc->local_size_p);
 609    debug_printf("stack size: 0x%x\n", desc->cstack_size);
 610    debug_printf("barrier count: %u\n", desc->bar_alloc);
 611    debug_printf("$r count: %u\n", desc->gpr_alloc);
 612    debug_printf("cache split: %s\n", nve4_cache_split_name(desc->cache_split));
 613
 614    for (i = 0; i < 8; ++i) {
 615       uint64_t address;
 616       uint32_t size = desc->cb[i].size;
 617       bool valid = !!(desc->cb_mask & (1 << i));
 618
 619       address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
 620
 621       if (!valid && !address && !size)
 622          continue;
 623       debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n",
 624                    i, address, size, valid ? "" : "  (invalid)");
 625    }
 626 }
 627 #endif
 628
 629 #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
 630 static void
 631 nve4_compute_trap_info(struct nvc0_context *nvc0)
 632 {
 633    struct nvc0_screen *screen = nvc0->screen;
 634    struct nouveau_bo *bo = screen->parm;
 635    int ret, i;
 636    volatile struct nve4_mp_trap_info *info;
 637    uint8_t *map;
 638
 639    ret = nouveau_bo_map(bo, NOUVEAU_BO_RDWR, nvc0->base.client);
 640    if (ret)
 641       return;
 642    map = (uint8_t *)bo->map;
 643    info = (volatile struct nve4_mp_trap_info *)(map + NVE4_CP_PARAM_TRAP_INFO);
 644
 645    if (info->lock) {
 646       debug_printf("trapstat = %08x\n", info->trapstat);
 647       debug_printf("warperr = %08x\n", info->warperr);
 648       debug_printf("PC = %x\n", info->pc);
 649       debug_printf("tid = %u %u %u\n",
 650                    info->tid[0], info->tid[1], info->tid[2]);
 651       debug_printf("ctaid = %u %u %u\n",
 652                    info->ctaid[0], info->ctaid[1], info->ctaid[2]);
 653       for (i = 0; i <= 63; ++i)
 654          debug_printf("$r%i = %08x\n", i, info->r[i]);
 655       for (i = 0; i <= 6; ++i)
 656          debug_printf("$p%i = %i\n", i, (info->flags >> i) & 1);
 657       debug_printf("$c = %x\n", info->flags >> 12);
 658    }
 659    info->lock = 0;
 660 }
 661 #endif