src/gallium/drivers/nouveau/nvc0/nve4_compute.c

   1 /*
   2  * Copyright 2012 Nouveau Project
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  * Authors: Christoph Bumiller
  23  */
  24
  25 #include "nvc0/nvc0_context.h"
  26 #include "nvc0/nve4_compute.h"
  27
  28 #include "codegen/nv50_ir_driver.h"
  29
  30 #ifdef DEBUG
  31 static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
  32 #endif
  33
  34
  35 int
  36 nve4_screen_compute_setup(struct nvc0_screen *screen,
  37                           struct nouveau_pushbuf *push)
  38 {
  39    struct nouveau_device *dev = screen->base.device;
  40    struct nouveau_object *chan = screen->base.channel;
  41    int i;
  42    int ret;
  43    uint32_t obj_class;
  44    uint64_t address;
  45
  46    switch (dev->chipset & ~0xf) {
  47    case 0x100:
  48    case 0xf0:
  49       obj_class = NVF0_COMPUTE_CLASS; /* GK110 */
  50       break;
  51    case 0xe0:
  52       obj_class = NVE4_COMPUTE_CLASS; /* GK104 */
  53       break;
  54    case 0x110:
  55       obj_class = GM107_COMPUTE_CLASS;
  56       break;
  57    case 0x120:
  58       obj_class = GM200_COMPUTE_CLASS;
  59       break;
  60    default:
  61       NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
  62       return -1;
  63    }
  64
  65    ret = nouveau_object_new(chan, 0xbeef00c0, obj_class, NULL, 0,
  66                             &screen->compute);
  67    if (ret) {
  68       NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret);
  69       return ret;
  70    }
  71
  72    ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
  73                         &screen->parm);
  74    if (ret)
  75       return ret;
  76
  77    BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
  78    PUSH_DATA (push, screen->compute->oclass);
  79
  80    BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2);
  81    PUSH_DATAh(push, screen->tls->offset);
  82    PUSH_DATA (push, screen->tls->offset);
  83    /* No idea why there are 2. Divide size by 2 to be safe.
  84     * Actually this might be per-MP TEMP size and looks like I'm only using
  85     * 2 MPs instead of all 8.
  86     */
  87    BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3);
  88    PUSH_DATAh(push, screen->tls->size / screen->mp_count);
  89    PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
  90    PUSH_DATA (push, 0xff);
  91    BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
  92    PUSH_DATAh(push, screen->tls->size / screen->mp_count);
  93    PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
  94    PUSH_DATA (push, 0xff);
  95
  96    /* Unified address space ? Who needs that ? Certainly not OpenCL.
  97     *
  98     * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
  99     *  accessible. We cannot prevent that at the moment, so expect failure.
 100     */
 101    BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
 102    PUSH_DATA (push, 0xff << 24);
 103    BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
 104    PUSH_DATA (push, 0xfe << 24);
 105
 106    BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
 107    PUSH_DATAh(push, screen->text->offset);
 108    PUSH_DATA (push, screen->text->offset);
 109
 110    BEGIN_NVC0(push, SUBC_CP(0x0310), 1);
 111    PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);
 112
 113    /* NOTE: these do not affect the state used by the 3D object */
 114    BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3);
 115    PUSH_DATAh(push, screen->txc->offset);
 116    PUSH_DATA (push, screen->txc->offset);
 117    PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
 118    BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3);
 119    PUSH_DATAh(push, screen->txc->offset + 65536);
 120    PUSH_DATA (push, screen->txc->offset + 65536);
 121    PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);
 122
 123    if (obj_class >= NVF0_COMPUTE_CLASS) {
 124       /* The blob calls GK110_COMPUTE.FIRMWARE[0x6], along with the args (0x1)
 125        * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently
 126        * disabled because our firmware doesn't support these commands and the
 127        * GPU hangs if they are used. */
 128       BEGIN_NIC0(push, SUBC_CP(0x0248), 64);
 129       for (i = 63; i >= 0; i--)
 130          PUSH_DATA(push, 0x38000 | i);
 131       IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
 132    }
 133
 134    BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
 135    PUSH_DATA (push, 7); /* does not interfere with 3D */
 136
 137    /* Disabling this UNK command avoid a read fault when using texelFetch()
 138     * from a compute shader for weird reasons.
 139    if (obj_class == NVF0_COMPUTE_CLASS)
 140       IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
 141    */
 142
 143    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 144
 145    /* MS sample coordinate offsets: these do not work with _ALT modes ! */
 146    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 147    PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
 148    PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
 149    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 150    PUSH_DATA (push, 64);
 151    PUSH_DATA (push, 1);
 152    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
 153    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 154    PUSH_DATA (push, 0); /* 0 */
 155    PUSH_DATA (push, 0);
 156    PUSH_DATA (push, 1); /* 1 */
 157    PUSH_DATA (push, 0);
 158    PUSH_DATA (push, 0); /* 2 */
 159    PUSH_DATA (push, 1);
 160    PUSH_DATA (push, 1); /* 3 */
 161    PUSH_DATA (push, 1);
 162    PUSH_DATA (push, 2); /* 4 */
 163    PUSH_DATA (push, 0);
 164    PUSH_DATA (push, 3); /* 5 */
 165    PUSH_DATA (push, 0);
 166    PUSH_DATA (push, 2); /* 6 */
 167    PUSH_DATA (push, 1);
 168    PUSH_DATA (push, 3); /* 7 */
 169    PUSH_DATA (push, 1);
 170
 171 #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
 172    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 173    PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
 174    PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
 175    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 176    PUSH_DATA (push, 28);
 177    PUSH_DATA (push, 1);
 178    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8);
 179    PUSH_DATA (push, 1);
 180    PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
 181    PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
 182    PUSH_DATA (push, screen->tls->offset);
 183    PUSH_DATAh(push, screen->tls->offset);
 184    PUSH_DATA (push, screen->tls->size / 2); /* MP TEMP block size */
 185    PUSH_DATA (push, screen->tls->size / 2 / 64); /* warp TEMP block size */
 186    PUSH_DATA (push, 0); /* warp cfstack size */
 187 #endif
 188
 189    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 190    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 191
 192    return 0;
 193 }
 194
 195
 196 static void
 197 nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
 198 {
 199    struct nvc0_screen *screen = nvc0->screen;
 200    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 201    struct nv50_surface *sf;
 202    struct nv04_resource *res;
 203    uint32_t mask;
 204    unsigned i;
 205    const unsigned t = 1;
 206    uint64_t address;
 207
 208    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 209
 210    mask = nvc0->surfaces_dirty[t];
 211    while (mask) {
 212       i = ffs(mask) - 1;
 213       mask &= ~(1 << i);
 214
 215       /*
 216        * NVE4's surface load/store instructions receive all the information
 217        * directly instead of via binding points, so we have to supply them.
 218        */
 219       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 220       PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(i));
 221       PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(i));
 222       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 223       PUSH_DATA (push, 64);
 224       PUSH_DATA (push, 1);
 225       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
 226       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 227
 228       nve4_set_surface_info(push, nvc0->surfaces[t][i], screen);
 229
 230       sf = nv50_surface(nvc0->surfaces[t][i]);
 231       if (sf) {
 232          res = nv04_resource(sf->base.texture);
 233
 234          if (sf->base.writable)
 235             BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
 236          else
 237             BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
 238       }
 239    }
 240    if (nvc0->surfaces_dirty[t]) {
 241       BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 242       PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 243    }
 244
 245    /* re-reference non-dirty surfaces */
 246    mask = nvc0->surfaces_valid[t] & ~nvc0->surfaces_dirty[t];
 247    while (mask) {
 248       i = ffs(mask) - 1;
 249       mask &= ~(1 << i);
 250
 251       sf = nv50_surface(nvc0->surfaces[t][i]);
 252       res = nv04_resource(sf->base.texture);
 253
 254       if (sf->base.writable)
 255          BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
 256       else
 257          BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
 258    }
 259
 260    nvc0->surfaces_dirty[t] = 0;
 261 }
 262
 263
 264 /* Thankfully, textures with samplers follow the normal rules. */
 265 static void
 266 nve4_compute_validate_samplers(struct nvc0_context *nvc0)
 267 {
 268    bool need_flush = nve4_validate_tsc(nvc0, 5);
 269    if (need_flush) {
 270       BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1);
 271       PUSH_DATA (nvc0->base.pushbuf, 0);
 272    }
 273 }
 274 /* (Code duplicated at bottom for various non-convincing reasons.
 275  *  E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
 276  *  entries to avoid a subchannel switch.
 277  *  Same for texture cache flushes.
 278  *  Also, the bufctx differs, and more IFs in the 3D version looks ugly.)
 279  */
 280 static void nve4_compute_validate_textures(struct nvc0_context *);
 281
 282 static void
 283 nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
 284 {
 285    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 286    struct nvc0_screen *screen = nvc0->screen;
 287    uint64_t address;
 288    const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
 289    unsigned i, n;
 290    uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];
 291
 292    if (!dirty)
 293       return;
 294    i = ffs(dirty) - 1;
 295    n = util_logbase2(dirty) + 1 - i;
 296    assert(n);
 297
 298    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 299
 300    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 301    PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
 302    PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
 303    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 304    PUSH_DATA (push, n * 4);
 305    PUSH_DATA (push, 0x1);
 306    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n);
 307    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 308    PUSH_DATAp(push, &nvc0->tex_handles[s][i], n);
 309
 310    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 311    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 312
 313    nvc0->textures_dirty[s] = 0;
 314    nvc0->samplers_dirty[s] = 0;
 315 }
 316
 317 static void
 318 nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
 319 {
 320    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 321    const int s = 5;
 322
 323    while (nvc0->constbuf_dirty[s]) {
 324       int i = ffs(nvc0->constbuf_dirty[s]) - 1;
 325       nvc0->constbuf_dirty[s] &= ~(1 << i);
 326
 327       if (nvc0->constbuf[s][i].user) {
 328          struct nouveau_bo *bo = nvc0->screen->uniform_bo;
 329          const unsigned base = NVC0_CB_USR_INFO(s);
 330          const unsigned size = nvc0->constbuf[s][0].size;
 331          assert(i == 0); /* we really only want OpenGL uniforms here */
 332          assert(nvc0->constbuf[s][0].u.data);
 333
 334          BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 335          PUSH_DATAh(push, bo->offset + base);
 336          PUSH_DATA (push, bo->offset + base);
 337          BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 338          PUSH_DATA (push, size);
 339          PUSH_DATA (push, 0x1);
 340          BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
 341          PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 342          PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
 343       }
 344       else {
 345          struct nv04_resource *res =
 346             nv04_resource(nvc0->constbuf[s][i].u.buf);
 347          if (res) {
 348             uint64_t address
 349                = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 350
 351             assert(i > 0); /* we really only want uniform buffer objects */
 352
 353             BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 354             PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
 355             PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
 356             BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 357             PUSH_DATA (push, 4 * 4);
 358             PUSH_DATA (push, 0x1);
 359             BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
 360             PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 361
 362             PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
 363             PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
 364             PUSH_DATA (push, nvc0->constbuf[5][i].size);
 365             PUSH_DATA (push, 0);
 366             BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
 367
 368             res->cb_bindings[s] |= 1 << i;
 369          }
 370       }
 371    }
 372
 373    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 374    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 375 }
 376
 377 static void
 378 nve4_compute_validate_buffers(struct nvc0_context *nvc0)
 379 {
 380    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 381    uint64_t address;
 382    const int s = 5;
 383    int i;
 384
 385    address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 386
 387    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 388    PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
 389    PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
 390    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 391    PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
 392    PUSH_DATA (push, 0x1);
 393    BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
 394    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 395
 396    for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
 397       if (nvc0->buffers[s][i].buffer) {
 398          struct nv04_resource *res =
 399             nv04_resource(nvc0->buffers[s][i].buffer);
 400          PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
 401          PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
 402          PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
 403          PUSH_DATA (push, 0);
 404          BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
 405       } else {
 406          PUSH_DATA (push, 0);
 407          PUSH_DATA (push, 0);
 408          PUSH_DATA (push, 0);
 409          PUSH_DATA (push, 0);
 410       }
 411    }
 412 }
 413
 414 static struct nvc0_state_validate
 415 validate_list_cp[] = {
 416    { nvc0_compprog_validate,              NVC0_NEW_CP_PROGRAM     },
 417    { nve4_compute_validate_textures,      NVC0_NEW_CP_TEXTURES    },
 418    { nve4_compute_validate_samplers,      NVC0_NEW_CP_SAMPLERS    },
 419    { nve4_compute_set_tex_handles,        NVC0_NEW_CP_TEXTURES |
 420                                           NVC0_NEW_CP_SAMPLERS    },
 421    { nve4_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
 422    { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
 423    { nve4_compute_validate_buffers,       NVC0_NEW_CP_BUFFERS     },
 424    { nve4_compute_validate_constbufs,     NVC0_NEW_CP_CONSTBUF    },
 425 };
 426
 427 static bool
 428 nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
 429 {
 430    bool ret;
 431
 432    ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
 433                              ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
 434                              nvc0->bufctx_cp);
 435
 436    if (unlikely(nvc0->state.flushed))
 437       nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
 438    return ret;
 439 }
 440
 441 static void
 442 nve4_compute_upload_input(struct nvc0_context *nvc0,
 443                           struct nve4_cp_launch_desc *desc,
 444                           const struct pipe_grid_info *info)
 445 {
 446    struct nvc0_screen *screen = nvc0->screen;
 447    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 448    struct nvc0_program *cp = nvc0->compprog;
 449    uint64_t address;
 450
 451    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 452
 453    if (cp->parm_size) {
 454       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 455       PUSH_DATAh(push, screen->parm->offset);
 456       PUSH_DATA (push, screen->parm->offset);
 457       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 458       PUSH_DATA (push, cp->parm_size);
 459       PUSH_DATA (push, 0x1);
 460       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
 461       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 462       PUSH_DATAp(push, info->input, cp->parm_size / 4);
 463
 464       /* Bind user parameters coming from clover. */
 465       /* TODO: This should be harmonized with uniform_bo. */
 466       assert(!(desc->cb_mask & (1 << 0)));
 467       nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12);
 468    }
 469    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 470    PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO);
 471    PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO);
 472    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 473    PUSH_DATA (push, 7 * 4);
 474    PUSH_DATA (push, 0x1);
 475
 476    if (unlikely(info->indirect)) {
 477       struct nv04_resource *res = nv04_resource(info->indirect);
 478       uint32_t offset = res->offset + info->indirect_offset;
 479
 480       nouveau_pushbuf_space(push, 16, 0, 1);
 481       PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
 482
 483       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
 484       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 485       PUSH_DATAp(push, info->block, 3);
 486       nouveau_pushbuf_data(push, res->bo, offset,
 487                            NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
 488    } else {
 489       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
 490       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 491       PUSH_DATAp(push, info->block, 3);
 492       PUSH_DATAp(push, info->grid, 3);
 493    }
 494    PUSH_DATA (push, 0);
 495
 496    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
 497    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 498 }
 499
 500 static inline uint8_t
 501 nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 502 {
 503    if (shared_size > (32 << 10))
 504       return NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1;
 505    if (shared_size > (16 << 10))
 506       return NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1;
 507    return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
 508 }
 509
 510 static void
 511 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
 512                                struct nve4_cp_launch_desc *desc,
 513                                const struct pipe_grid_info *info)
 514 {
 515    const struct nvc0_screen *screen = nvc0->screen;
 516    const struct nvc0_program *cp = nvc0->compprog;
 517
 518    nve4_cp_launch_desc_init_default(desc);
 519
 520    desc->entry = nvc0_program_symbol_offset(cp, info->pc);
 521
 522    desc->griddim_x = info->grid[0];
 523    desc->griddim_y = info->grid[1];
 524    desc->griddim_z = info->grid[2];
 525    desc->blockdim_x = info->block[0];
 526    desc->blockdim_y = info->block[1];
 527    desc->blockdim_z = info->block[2];
 528
 529    desc->shared_size = align(cp->cp.smem_size, 0x100);
 530    desc->local_size_p = align(cp->cp.lmem_size, 0x10);
 531    desc->local_size_n = 0;
 532    desc->cstack_size = 0x800;
 533    desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size);
 534
 535    desc->gpr_alloc = cp->num_gprs;
 536    desc->bar_alloc = cp->num_barriers;
 537
 538    // Only bind OpenGL uniforms and the driver constant buffer through the
 539    // launch descriptor because UBOs are sticked to the driver cb to avoid the
 540    // limitation of 8 CBs.
 541    if (nvc0->constbuf[5][0].user) {
 542       nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
 543                                  NVC0_CB_USR_INFO(5), 1 << 16);
 544    }
 545    nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
 546                               NVC0_CB_AUX_INFO(5), 1 << 11);
 547 }
 548
 549 static inline struct nve4_cp_launch_desc *
 550 nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
 551                                struct nouveau_bo **pbo, uint64_t *pgpuaddr)
 552 {
 553    uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
 554    if (!ptr)
 555       return NULL;
 556    if (*pgpuaddr & 255) {
 557       unsigned adj = 256 - (*pgpuaddr & 255);
 558       ptr += adj;
 559       *pgpuaddr += adj;
 560    }
 561    return (struct nve4_cp_launch_desc *)ptr;
 562 }
 563
 564 void
 565 nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 566 {
 567    struct nvc0_context *nvc0 = nvc0_context(pipe);
 568    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 569    struct nve4_cp_launch_desc *desc;
 570    uint64_t desc_gpuaddr;
 571    struct nouveau_bo *desc_bo;
 572    int ret;
 573
 574    desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
 575    if (!desc) {
 576       ret = -1;
 577       goto out;
 578    }
 579    BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
 580                 desc_bo);
 581
 582    ret = !nve4_state_validate_cp(nvc0, ~0);
 583    if (ret)
 584       goto out;
 585
 586    nve4_compute_setup_launch_desc(nvc0, desc, info);
 587
 588    nve4_compute_upload_input(nvc0, desc, info);
 589
 590 #ifdef DEBUG
 591    if (debug_get_num_option("NV50_PROG_DEBUG", 0))
 592       nve4_compute_dump_launch_desc(desc);
 593 #endif
 594
 595    if (unlikely(info->indirect)) {
 596       struct nv04_resource *res = nv04_resource(info->indirect);
 597       uint32_t offset = res->offset + info->indirect_offset;
 598
 599       /* upload the descriptor */
 600       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 601       PUSH_DATAh(push, desc_gpuaddr);
 602       PUSH_DATA (push, desc_gpuaddr);
 603       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 604       PUSH_DATA (push, 256);
 605       PUSH_DATA (push, 1);
 606       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
 607       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
 608       PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
 609
 610       /* overwrite griddim_x and griddim_y as two 32-bits integers even
 611        * if griddim_y must be a 16-bits integer */
 612       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 613       PUSH_DATAh(push, desc_gpuaddr + 48);
 614       PUSH_DATA (push, desc_gpuaddr + 48);
 615       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 616       PUSH_DATA (push, 8);
 617       PUSH_DATA (push, 1);
 618
 619       nouveau_pushbuf_space(push, 16, 0, 1);
 620       PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
 621
 622       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
 623       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
 624       nouveau_pushbuf_data(push, res->bo, offset,
 625                            NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
 626
 627       /* overwrite the 16 high bits of griddim_y with griddim_z because
 628        * we need (z << 16) | x */
 629       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 630       PUSH_DATAh(push, desc_gpuaddr + 54);
 631       PUSH_DATA (push, desc_gpuaddr + 54);
 632       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 633       PUSH_DATA (push, 4);
 634       PUSH_DATA (push, 1);
 635       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
 636       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
 637       nouveau_pushbuf_data(push, res->bo, offset + 8,
 638                            NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
 639    }
 640
 641    /* upload descriptor and flush */
 642    BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
 643    PUSH_DATA (push, desc_gpuaddr >> 8);
 644    BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
 645    PUSH_DATA (push, 0x3);
 646    BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
 647    PUSH_DATA (push, 0);
 648
 649 out:
 650    if (ret)
 651       NOUVEAU_ERR("Failed to launch grid !\n");
 652    nouveau_scratch_done(&nvc0->base);
 653    nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC);
 654 }
 655
 656
 657 #define NVE4_TIC_ENTRY_INVALID 0x000fffff
 658
 659 static void
 660 nve4_compute_validate_textures(struct nvc0_context *nvc0)
 661 {
 662    struct nouveau_bo *txc = nvc0->screen->txc;
 663    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 664    const unsigned s = 5;
 665    unsigned i;
 666    uint32_t commands[2][32];
 667    unsigned n[2] = { 0, 0 };
 668
 669    for (i = 0; i < nvc0->num_textures[s]; ++i) {
 670       struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
 671       struct nv04_resource *res;
 672       const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 673
 674       if (!tic) {
 675          nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
 676          continue;
 677       }
 678       res = nv04_resource(tic->pipe.texture);
 679
 680       if (tic->id < 0) {
 681          tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
 682
 683          PUSH_SPACE(push, 16);
 684          BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
 685          PUSH_DATAh(push, txc->offset + (tic->id * 32));
 686          PUSH_DATA (push, txc->offset + (tic->id * 32));
 687          BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
 688          PUSH_DATA (push, 32);
 689          PUSH_DATA (push, 1);
 690          BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
 691          PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 692          PUSH_DATAp(push, &tic->tic[0], 8);
 693
 694          commands[0][n[0]++] = (tic->id << 4) | 1;
 695       } else
 696       if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
 697          commands[1][n[1]++] = (tic->id << 4) | 1;
 698       }
 699       nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
 700
 701       res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
 702       res->status |=  NOUVEAU_BUFFER_STATUS_GPU_READING;
 703
 704       nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
 705       nvc0->tex_handles[s][i] |= tic->id;
 706       if (dirty)
 707          BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
 708    }
 709    for (; i < nvc0->state.num_textures[s]; ++i)
 710       nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
 711
 712    if (n[0]) {
 713       BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]);
 714       PUSH_DATAp(push, commands[0], n[0]);
 715    }
 716    if (n[1]) {
 717       BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]);
 718       PUSH_DATAp(push, commands[1], n[1]);
 719    }
 720
 721    nvc0->state.num_textures[s] = nvc0->num_textures[s];
 722 }
 723
 724
 725 #ifdef DEBUG
 726 static const char *nve4_cache_split_name(unsigned value)
 727 {
 728    switch (value) {
 729    case NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1: return "16K_SHARED_48K_L1";
 730    case NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1: return "32K_SHARED_32K_L1";
 731    case NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1: return "48K_SHARED_16K_L1";
 732    default:
 733       return "(invalid)";
 734    }
 735 }
 736
 737 static void
 738 nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
 739 {
 740    const uint32_t *data = (const uint32_t *)desc;
 741    unsigned i;
 742    bool zero = false;
 743
 744    debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
 745
 746    for (i = 0; i < sizeof(*desc); i += 4) {
 747       if (data[i / 4]) {
 748          debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
 749          zero = false;
 750       } else
 751       if (!zero) {
 752          debug_printf("...\n");
 753          zero = true;
 754       }
 755    }
 756
 757    debug_printf("entry = 0x%x\n", desc->entry);
 758    debug_printf("grid dimensions = %ux%ux%u\n",
 759                 desc->griddim_x, desc->griddim_y, desc->griddim_z);
 760    debug_printf("block dimensions = %ux%ux%u\n",
 761                 desc->blockdim_x, desc->blockdim_y, desc->blockdim_z);
 762    debug_printf("s[] size: 0x%x\n", desc->shared_size);
 763    debug_printf("l[] size: -0x%x / +0x%x\n",
 764                 desc->local_size_n, desc->local_size_p);
 765    debug_printf("stack size: 0x%x\n", desc->cstack_size);
 766    debug_printf("barrier count: %u\n", desc->bar_alloc);
 767    debug_printf("$r count: %u\n", desc->gpr_alloc);
 768    debug_printf("cache split: %s\n", nve4_cache_split_name(desc->cache_split));
 769
 770    for (i = 0; i < 8; ++i) {
 771       uint64_t address;
 772       uint32_t size = desc->cb[i].size;
 773       bool valid = !!(desc->cb_mask & (1 << i));
 774
 775       address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
 776
 777       if (!valid && !address && !size)
 778          continue;
 779       debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n",
 780                    i, address, size, valid ? "" : "  (invalid)");
 781    }
 782 }
 783 #endif
 784
 785 #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
 786 static void
 787 nve4_compute_trap_info(struct nvc0_context *nvc0)
 788 {
 789    struct nvc0_screen *screen = nvc0->screen;
 790    struct nouveau_bo *bo = screen->parm;
 791    int ret, i;
 792    volatile struct nve4_mp_trap_info *info;
 793    uint8_t *map;
 794
 795    ret = nouveau_bo_map(bo, NOUVEAU_BO_RDWR, nvc0->base.client);
 796    if (ret)
 797       return;
 798    map = (uint8_t *)bo->map;
 799    info = (volatile struct nve4_mp_trap_info *)(map + NVE4_CP_PARAM_TRAP_INFO);
 800
 801    if (info->lock) {
 802       debug_printf("trapstat = %08x\n", info->trapstat);
 803       debug_printf("warperr = %08x\n", info->warperr);
 804       debug_printf("PC = %x\n", info->pc);
 805       debug_printf("tid = %u %u %u\n",
 806                    info->tid[0], info->tid[1], info->tid[2]);
 807       debug_printf("ctaid = %u %u %u\n",
 808                    info->ctaid[0], info->ctaid[1], info->ctaid[2]);
 809       for (i = 0; i <= 63; ++i)
 810          debug_printf("$r%i = %08x\n", i, info->r[i]);
 811       for (i = 0; i <= 6; ++i)
 812          debug_printf("$p%i = %i\n", i, (info->flags >> i) & 1);
 813       debug_printf("$c = %x\n", info->flags >> 12);
 814    }
 815    info->lock = 0;
 816 }
 817 #endif