X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fnouveau%2Fnvc0%2Fnve4_compute.c;h=e85e9b48b7f8ec1e88488627967374c71d04e6a7;hb=a227b0a4f1354f145ff49183b687dd7541a24c86;hp=fce02a7cc576a4ce68097e79620c269fe5f6928b;hpb=6ee082718fca884fbda73001e0ecb32095409549;p=mesa.git

diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index fce02a7cc57..e85e9b48b7f 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -23,7 +23,6 @@
  */
 
 #include "nvc0/nvc0_context.h"
-#include "nvc0/nvc0_compute.h"
 #include "nvc0/nve4_compute.h"
 
 #include "codegen/nv50_ir_driver.h"
@@ -39,9 +38,10 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
 {
    struct nouveau_device *dev = screen->base.device;
    struct nouveau_object *chan = screen->base.channel;
-   unsigned i;
+   int i;
    int ret;
    uint32_t obj_class;
+   uint64_t address;
 
    switch (dev->chipset & ~0xf) {
    case 0x100:
    case 0xf0:
       obj_class = NVF0_COMPUTE_CLASS; /* GK110 */
       break;
@@ -51,6 +51,12 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    case 0xe0:
       obj_class = NVE4_COMPUTE_CLASS; /* GK104 */
       break;
+   case 0x110:
+      obj_class = GM107_COMPUTE_CLASS;
+      break;
+   case 0x120:
+      obj_class = GM200_COMPUTE_CLASS;
+      break;
    default:
       NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
       return -1;
    }
@@ -63,26 +69,21 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
       return ret;
    }
 
-   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL,
-                        &screen->parm);
-   if (ret)
-      return ret;
-
-   BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
    PUSH_DATA (push, screen->compute->oclass);
 
-   BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_ADDRESS_HIGH), 2);
+   BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->tls->offset);
    PUSH_DATA (push, screen->tls->offset);
    /* No idea why there are 2. Divide size by 2 to be safe.
    * Actually this might be per-MP TEMP size and looks like I'm only using
    * 2 MPs instead of all 8.
    */
-   BEGIN_NVC0(push, NVE4_COMPUTE(MP_TEMP_SIZE_HIGH(0)), 3);
+   BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3);
    PUSH_DATAh(push, screen->tls->size / screen->mp_count);
    PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
    PUSH_DATA (push, 0xff);
-   BEGIN_NVC0(push, NVE4_COMPUTE(MP_TEMP_SIZE_HIGH(1)), 3);
+   BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
    PUSH_DATAh(push, screen->tls->size / screen->mp_count);
    PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
    PUSH_DATA (push, 0xff);
@@ -92,52 +93,58 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
    * accessible. We cannot prevent that at the moment, so expect failure.
    */
-   BEGIN_NVC0(push, NVE4_COMPUTE(LOCAL_BASE), 1);
-   PUSH_DATA (push, 1 << 24);
-   BEGIN_NVC0(push, NVE4_COMPUTE(SHARED_BASE), 1);
-   PUSH_DATA (push, 2 << 24);
+   BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
+   PUSH_DATA (push, 0xff << 24);
+   BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
+   PUSH_DATA (push, 0xfe << 24);
 
-   BEGIN_NVC0(push, NVE4_COMPUTE(CODE_ADDRESS_HIGH), 2);
+   BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->text->offset);
    PUSH_DATA (push, screen->text->offset);
 
-   BEGIN_NVC0(push, SUBC_COMPUTE(0x0310), 1);
+   BEGIN_NVC0(push, SUBC_CP(0x0310), 1);
    PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);
 
    /* NOTE: these do not affect the state used by the 3D object */
-   BEGIN_NVC0(push, NVE4_COMPUTE(TIC_ADDRESS_HIGH), 3);
+   BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3);
    PUSH_DATAh(push, screen->txc->offset);
    PUSH_DATA (push, screen->txc->offset);
    PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
-   BEGIN_NVC0(push, NVE4_COMPUTE(TSC_ADDRESS_HIGH), 3);
+   BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3);
    PUSH_DATAh(push, screen->txc->offset + 65536);
    PUSH_DATA (push, screen->txc->offset + 65536);
    PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);
 
    if (obj_class >= NVF0_COMPUTE_CLASS) {
-      BEGIN_NVC0(push, SUBC_COMPUTE(0x0248), 1);
-      PUSH_DATA (push, 0x100);
-      BEGIN_NIC0(push, SUBC_COMPUTE(0x0248), 63);
-      for (i = 63; i >= 1; --i)
+      /* The blob calls GK110_COMPUTE.FIRMWARE[0x6], along with the args (0x1)
+       * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently
+       * disabled because our firmware doesn't support these commands and the
+       * GPU hangs if they are used. */
+      BEGIN_NIC0(push, SUBC_CP(0x0248), 64);
+      for (i = 63; i >= 0; i--)
         PUSH_DATA(push, 0x38000 | i);
-      IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
-      IMMED_NVC0(push, SUBC_COMPUTE(0x518), 0);
+      IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
    }
 
-   BEGIN_NVC0(push, NVE4_COMPUTE(TEX_CB_INDEX), 1);
-   PUSH_DATA (push, 0); /* does not interefere with 3D */
+   BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
+   PUSH_DATA (push, 7); /* does not interfere with 3D */
 
-   if (obj_class >= NVF0_COMPUTE_CLASS)
-      IMMED_NVC0(push, SUBC_COMPUTE(0x02c4), 1);
+   /* Disabling this UNK command avoids a read fault when using texelFetch()
+    * from a compute shader for weird reasons.
+   if (obj_class == NVF0_COMPUTE_CLASS)
+      IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
+   */
+
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 
    /* MS sample coordinate offsets: these do not work with _ALT modes !
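    * (Hedged reading, not in the original comment: the 16 data words pushed
    *  below presumably encode eight (x, y) pairs, one coordinate offset per
    *  sample.)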
    */
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
-   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
+   PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, 64);
    PUSH_DATA (push, 1);
-   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17);
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
    PUSH_DATA (push, 0); /* 0 */
    PUSH_DATA (push, 0);
@@ -156,14 +163,14 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    PUSH_DATA (push, 3); /* 7 */
    PUSH_DATA (push, 1);
 
-#ifdef DEBUG
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
+#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
    PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, 28);
    PUSH_DATA (push, 1);
-   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 8);
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8);
    PUSH_DATA (push, 1);
    PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
    PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
@@ -174,88 +181,136 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    PUSH_DATA (push, 0); /* warp cfstack size */
 #endif
 
-   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 
    return 0;
 }
-
 static void
-nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
+gm107_compute_validate_surfaces(struct nvc0_context *nvc0,
+                                struct pipe_image_view *view, int slot)
 {
-   struct nvc0_screen *screen = nvc0->screen;
+   struct nv04_resource *res = nv04_resource(view->resource);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   struct nv50_surface *sf;
-   struct nv04_resource *res;
-   uint32_t mask;
-   unsigned i;
-   const unsigned t = 1;
-
-   mask = nvc0->surfaces_dirty[t];
-   while (mask) {
-      i = ffs(mask) - 1;
-      mask &= ~(1 << i);
-
-      /*
-       * NVE4's surface load/store instructions receive all the information
-       * directly instead of via binding points, so we have to supply them.
-       */
-      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
-      PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
-      PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
-      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
-      PUSH_DATA (push, 64);
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nouveau_bo *txc = nvc0->screen->txc;
+   struct nv50_tic_entry *tic;
+   uint64_t address;
+   const int s = 5;
+
+   tic = nv50_tic_entry(nvc0->images_tic[s][slot]);
+
+   res = nv04_resource(tic->pipe.texture);
+   nvc0_update_tic(nvc0, tic, res);
+
+   if (tic->id < 0) {
+      tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
+
+      /* upload the texture view */
+      PUSH_SPACE(push, 16);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, txc->offset + (tic->id * 32));
+      PUSH_DATA (push, txc->offset + (tic->id * 32));
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 32);
       PUSH_DATA (push, 1);
-      BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, &tic->tic[0], 8);
+
+      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), 1);
+      PUSH_DATA (push, (tic->id << 4) | 1);
+   } else
+   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), 1);
+      PUSH_DATA (push, (tic->id << 4) | 1);
+   }
+   nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
 
-      nve4_set_surface_info(push, nvc0->surfaces[t][i], screen);
+   res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+   res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
 
-      sf = nv50_surface(nvc0->surfaces[t][i]);
-      if (sf) {
-         res = nv04_resource(sf->base.texture);
+   BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
 
-         if (sf->base.writable)
-            BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
-         else
-            BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
-      }
-   }
-   if (nvc0->surfaces_dirty[t]) {
-      BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
-      PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
-   }
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 
-   /* re-reference non-dirty surfaces */
-   mask = nvc0->surfaces_valid[t] & ~nvc0->surfaces_dirty[t];
-   while (mask) {
-      i = ffs(mask) - 1;
-      mask &= ~(1 << i);
+   /* upload the texture handle */
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
+   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+   PUSH_DATA (push, 4);
+   PUSH_DATA (push, 0x1);
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 2);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+   PUSH_DATA (push, tic->id);
+
+   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+}
+
+static void
+nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint64_t address;
+   const int s = 5;
+   int i, j;
+
+   if (!nvc0->images_dirty[s])
+      return;
 
-      sf = nv50_surface(nvc0->surfaces[t][i]);
-      res = nv04_resource(sf->base.texture);
+   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 
-      if (sf->base.writable)
+   for (i = 0; i < NVC0_MAX_IMAGES; ++i) {
+      struct pipe_image_view *view = &nvc0->images[s][i];
+
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, address + NVC0_CB_AUX_SU_INFO(i));
+      PUSH_DATA (push, address + NVC0_CB_AUX_SU_INFO(i));
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 16 * 4);
+      PUSH_DATA (push, 0x1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 16);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+      if (view->resource) {
+         struct nv04_resource *res = nv04_resource(view->resource);
+
+         if (res->base.target == PIPE_BUFFER) {
+            if (view->access & PIPE_IMAGE_ACCESS_WRITE)
+               nvc0_mark_image_range_valid(view);
+         }
+
+         nve4_set_surface_info(push, view, nvc0);
         BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
-      else
-         BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
-   }
 
-   nvc0->surfaces_dirty[t] = 0;
+         if (nvc0->screen->base.class_3d >= GM107_3D_CLASS)
+            gm107_compute_validate_surfaces(nvc0, view, i);
+      } else {
+         for (j = 0; j < 16; j++)
+            PUSH_DATA(push, 0);
+      }
+   }
 }
-
 /* Thankfully, textures with samplers follow the normal rules. */
 static void
 nve4_compute_validate_samplers(struct nvc0_context *nvc0)
 {
-   boolean need_flush = nve4_validate_tsc(nvc0, 5);
+   bool need_flush = nve4_validate_tsc(nvc0, 5);
    if (need_flush) {
-      BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1);
+      BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1);
      PUSH_DATA (nvc0->base.pushbuf, 0);
    }
+
+   /* Invalidate all 3D samplers because they are aliased. */
+   for (int s = 0; s < 5; s++)
+      nvc0->samplers_dirty[s] = ~0;
+   nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS;
 }
+
 /* (Code duplicated at bottom for various non-convincing reasons.
  * E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
  * entries to avoid a subchannel switch.
  */
@@ -268,6 +323,7 @@ static void
 nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    uint64_t address;
    const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
    unsigned i, n;
@@ -279,92 +335,207 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
    n = util_logbase2(dirty) + 1 - i;
    assert(n);
 
-   address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i);
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, address);
-   PUSH_DATA (push, address);
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
+   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, n * 4);
    PUSH_DATA (push, 0x1);
-   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + n);
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n);
    PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
    PUSH_DATAp(push, &nvc0->tex_handles[s][i], n);
 
-   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 
    nvc0->textures_dirty[s] = 0;
    nvc0->samplers_dirty[s] = 0;
 }
 
+static void
+nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const int s = 5;
+
+   while (nvc0->constbuf_dirty[s]) {
+      int i = ffs(nvc0->constbuf_dirty[s]) - 1;
+      nvc0->constbuf_dirty[s] &= ~(1 << i);
+
+      if (nvc0->constbuf[s][i].user) {
+         struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+         const unsigned base = NVC0_CB_USR_INFO(s);
+         const unsigned size = nvc0->constbuf[s][0].size;
+         assert(i == 0); /* we really only want OpenGL uniforms here */
+         assert(nvc0->constbuf[s][0].u.data);
+
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+         PUSH_DATAh(push, bo->offset + base);
+         PUSH_DATA (push, bo->offset + base);
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+         PUSH_DATA (push, size);
+         PUSH_DATA (push, 0x1);
+         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
+         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+         PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
+      }
+      else {
+         struct nv04_resource *res =
+            nv04_resource(nvc0->constbuf[s][i].u.buf);
+         if (res) {
+            uint64_t address
+               = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+            assert(i > 0); /* we really only want uniform buffer objects */
+
+            BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+            PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+            PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+            BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+            PUSH_DATA (push, 4 * 4);
+            PUSH_DATA (push, 0x1);
+            BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
+            PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+            PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
+            PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
+            PUSH_DATA (push, nvc0->constbuf[5][i].size);
+            PUSH_DATA (push, 0);
+            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
+
+            res->cb_bindings[s] |= 1 << i;
+         }
+      }
+   }
+
+   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+}
 
-static boolean
-nve4_compute_state_validate(struct nvc0_context *nvc0)
+static void
+nve4_compute_validate_buffers(struct nvc0_context *nvc0)
 {
-   if (!nvc0_compute_validate_program(nvc0))
-      return FALSE;
-   if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES)
-      nve4_compute_validate_textures(nvc0);
-   if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS)
-      nve4_compute_validate_samplers(nvc0);
-   if (nvc0->dirty_cp & (NVC0_NEW_CP_TEXTURES | NVC0_NEW_CP_SAMPLERS))
-      nve4_compute_set_tex_handles(nvc0);
-   if (nvc0->dirty_cp & NVC0_NEW_CP_SURFACES)
-      nve4_compute_validate_surfaces(nvc0);
-   if (nvc0->dirty_cp & NVC0_NEW_CP_GLOBALS)
-      nvc0_validate_global_residents(nvc0,
-                                     nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL);
-
-   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE);
-
-   nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
-   if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf)))
-      return FALSE;
-   if (unlikely(nvc0->state.flushed))
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint64_t address;
+   const int s = 5;
+   int i;
+
+   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
+   PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+   PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
+   PUSH_DATA (push, 0x1);
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
 
-   return TRUE;
+   for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
+      if (nvc0->buffers[s][i].buffer) {
+         struct nv04_resource *res =
+            nv04_resource(nvc0->buffers[s][i].buffer);
+         PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
+         PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
+         PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
+         PUSH_DATA (push, 0);
+         BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
+         util_range_add(&res->valid_buffer_range,
+                        nvc0->buffers[s][i].buffer_offset,
+                        nvc0->buffers[s][i].buffer_size);
+      } else {
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+      }
+   }
 }
 
+static struct nvc0_state_validate
+validate_list_cp[] = {
+   { nvc0_compprog_validate,          NVC0_NEW_CP_PROGRAM },
+   { nve4_compute_validate_textures,  NVC0_NEW_CP_TEXTURES },
+   { nve4_compute_validate_samplers,  NVC0_NEW_CP_SAMPLERS },
+   { nve4_compute_set_tex_handles,    NVC0_NEW_CP_TEXTURES |
+                                      NVC0_NEW_CP_SAMPLERS },
+   { nve4_compute_validate_surfaces,  NVC0_NEW_CP_SURFACES },
+   { nvc0_compute_validate_globals,   NVC0_NEW_CP_GLOBALS },
+   { nve4_compute_validate_buffers,   NVC0_NEW_CP_BUFFERS },
+   { nve4_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF },
+};
+
+static bool
+nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
+{
+   bool ret;
+
+   ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
+                             ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
+                             nvc0->bufctx_cp);
+
+   if (unlikely(nvc0->state.flushed))
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
+   return ret;
+}
 
 static void
-nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
-                          const uint *block_layout,
-                          const uint *grid_layout)
+nve4_compute_upload_input(struct nvc0_context *nvc0,
+                          const struct pipe_grid_info *info)
 {
    struct nvc0_screen *screen = nvc0->screen;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_program *cp = nvc0->compprog;
+   uint64_t address;
+
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 
    if (cp->parm_size) {
-      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
-      PUSH_DATAh(push, screen->parm->offset);
-      PUSH_DATA (push, screen->parm->offset);
-      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, cp->parm_size);
      PUSH_DATA (push, 0x1);
-      BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-      PUSH_DATAp(push, input, cp->parm_size / 4);
+      PUSH_DATAp(push, info->input, cp->parm_size / 4);
    }
 
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
-   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
-   PUSH_DATA (push, 7 * 4);
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO(0));
+   PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO(0));
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+   PUSH_DATA (push, 8 * 4);
    PUSH_DATA (push, 0x1);
-   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + 7);
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-   PUSH_DATAp(push, block_layout, 3);
-   PUSH_DATAp(push, grid_layout, 3);
+
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, info->block, 3);
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+   } else {
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, info->block, 3);
+      PUSH_DATAp(push, info->grid, 3);
+   }
    PUSH_DATA (push, 0);
+   PUSH_DATA (push, info->work_dim);
 
-   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 }
 
-static INLINE uint8_t
+static inline uint8_t
 nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 {
    if (shared_size > (32 << 10))
@@ -377,27 +548,24 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 static void
 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
                                struct nve4_cp_launch_desc *desc,
-                               uint32_t label,
-                               const uint *block_layout,
-                               const uint *grid_layout)
+                               const struct pipe_grid_info *info)
 {
    const struct nvc0_screen *screen = nvc0->screen;
    const struct nvc0_program *cp = nvc0->compprog;
-   unsigned i;
 
    nve4_cp_launch_desc_init_default(desc);
 
-   desc->entry = nvc0_program_symbol_offset(cp, label);
+   desc->entry = nvc0_program_symbol_offset(cp, info->pc);
 
-   desc->griddim_x = grid_layout[0];
-   desc->griddim_y = grid_layout[1];
-   desc->griddim_z = grid_layout[2];
-   desc->blockdim_x = block_layout[0];
-   desc->blockdim_y = block_layout[1];
-   desc->blockdim_z = block_layout[2];
+   desc->griddim_x = info->grid[0];
+   desc->griddim_y = info->grid[1];
+   desc->griddim_z = info->grid[2];
+   desc->blockdim_x = info->block[0];
+   desc->blockdim_y = info->block[1];
+   desc->blockdim_z = info->block[2];
 
    desc->shared_size = align(cp->cp.smem_size, 0x100);
-   desc->local_size_p = align(cp->cp.lmem_size, 0x10);
+   desc->local_size_p = (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10);
    desc->local_size_n = 0;
    desc->cstack_size = 0x800;
    desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size);
@@ -405,15 +573,18 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
    desc->gpr_alloc = cp->num_gprs;
    desc->bar_alloc = cp->num_barriers;
 
-   for (i = 0; i < 7; ++i) {
-      const unsigned s = 5;
-      if (nvc0->constbuf[s][i].u.buf)
-         nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
+   // Only bind user uniforms and the driver constant buffer through the
+   // launch descriptor because UBOs are stuck to the driver cb to avoid the
+   // limitation of 8 CBs.
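+   // (The per-UBO addresses and sizes are written into the driver's aux
+   // constant buffer by nve4_compute_validate_constbufs() instead, and that
+   // aux buffer is bound as cb slot 7 below.)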
+   if (nvc0->constbuf[5][0].user || cp->parm_size) {
+      nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
+                                 NVC0_CB_USR_INFO(5), 1 << 16);
    }
-   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+   nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
+                              NVC0_CB_AUX_INFO(5), 1 << 11);
 }
 
-static INLINE struct nve4_cp_launch_desc *
+static inline struct nve4_cp_launch_desc *
 nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
                                struct nouveau_bo **pbo, uint64_t *pgpuaddr)
 {
@@ -429,10 +600,7 @@ nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
 }
 
 void
-nve4_launch_grid(struct pipe_context *pipe,
-                 const uint *block_layout, const uint *grid_layout,
-                 uint32_t label,
-                 const void *input)
+nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -449,37 +617,71 @@ nve4_launch_grid(struct pipe_context *pipe,
    BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
                 desc_bo);
 
-   ret = !nve4_compute_state_validate(nvc0);
+   ret = !nve4_state_validate_cp(nvc0, ~0);
    if (ret)
      goto out;
 
-   nve4_compute_setup_launch_desc(nvc0, desc, label, block_layout, grid_layout);
+   nve4_compute_setup_launch_desc(nvc0, desc, info);
+
+   nve4_compute_upload_input(nvc0, info);
+
 #ifdef DEBUG
    if (debug_get_num_option("NV50_PROG_DEBUG", 0))
      nve4_compute_dump_launch_desc(desc);
 #endif
 
-   nve4_compute_upload_input(nvc0, input, block_layout, grid_layout);
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      /* upload the descriptor */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr);
+      PUSH_DATA (push, desc_gpuaddr);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 256);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+
+      /* overwrite griddim_x and griddim_y as two 32-bit integers even
+       * if griddim_y must be a 16-bit integer */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 48);
+      PUSH_DATA (push, desc_gpuaddr + 48);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 8);
+      PUSH_DATA (push, 1);
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
+
+      /* overwrite the 16 high bits of griddim_y with griddim_z because
+       * we need (z << 16) | x */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 54);
+      PUSH_DATA (push, desc_gpuaddr + 54);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 4);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset + 8,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
+   }
 
    /* upload descriptor and flush */
-#if 0
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, desc_gpuaddr);
-   PUSH_DATA (push, desc_gpuaddr);
-   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
-   PUSH_DATA (push, 256);
-   PUSH_DATA (push, 1);
-   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (256 / 4));
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
-   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
-   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
-   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
-#endif
-   BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH_DESC_ADDRESS), 1);
+   BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
    PUSH_DATA (push, desc_gpuaddr >> 8);
-   BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH), 1);
+   BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
    PUSH_DATA (push, 0x3);
-   BEGIN_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
    PUSH_DATA (push, 0);
 
 out:
@@ -499,31 +701,32 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    const unsigned s = 5;
    unsigned i;
-   uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX];
+   uint32_t commands[2][32];
    unsigned n[2] = { 0, 0 };
 
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
      struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
      struct nv04_resource *res;
-      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 
      if (!tic) {
        nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
        continue;
      }
      res = nv04_resource(tic->pipe.texture);
+      nvc0_update_tic(nvc0, tic, res);
 
      if (tic->id < 0) {
        tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
 
        PUSH_SPACE(push, 16);
-         BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
        PUSH_DATAh(push, txc->offset + (tic->id * 32));
        PUSH_DATA (push, txc->offset + (tic->id * 32));
-         BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
        PUSH_DATA (push, 32);
        PUSH_DATA (push, 1);
-         BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 9);
+         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
        PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
        PUSH_DATAp(push, &tic->tic[0], 8);
 
@@ -542,19 +745,29 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
      if (dirty)
        BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
    }
-   for (; i < nvc0->state.num_textures[s]; ++i)
+   for (; i < nvc0->state.num_textures[s]; ++i) {
      nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
+      nvc0->textures_dirty[s] |= 1 << i;
+   }
 
    if (n[0]) {
-      BEGIN_NIC0(push, NVE4_COMPUTE(TIC_FLUSH), n[0]);
+      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]);
      PUSH_DATAp(push, commands[0], n[0]);
    }
    if (n[1]) {
-      BEGIN_NIC0(push, NVE4_COMPUTE(TEX_CACHE_CTL), n[1]);
+      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]);
      PUSH_DATAp(push, commands[1], n[1]);
    }
 
    nvc0->state.num_textures[s] = nvc0->num_textures[s];
+
+   /* Invalidate all 3D textures because they are aliased.
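+    * (Compute uploads TIC entries into the same TIC area the 3D engine
+    *  uses, so the 3D texture bindings have to be revalidated afterwards.)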
+    */
+   for (int s = 0; s < 5; s++) {
+      for (int i = 0; i < nvc0->num_textures[s]; i++)
+         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i));
+      nvc0->textures_dirty[s] = ~0;
+   }
+   nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
 }
 
@@ -575,18 +788,18 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
 {
    const uint32_t *data = (const uint32_t *)desc;
    unsigned i;
-   boolean zero = FALSE;
+   bool zero = false;
 
    debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
 
    for (i = 0; i < sizeof(*desc); i += 4) {
      if (data[i / 4]) {
        debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
-         zero = FALSE;
+         zero = false;
      } else
      if (!zero) {
        debug_printf("...\n");
-         zero = TRUE;
+         zero = true;
      }
    }
 
@@ -606,7 +819,7 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
    for (i = 0; i < 8; ++i) {
      uint64_t address;
      uint32_t size = desc->cb[i].size;
-      boolean valid = !!(desc->cb_mask & (1 << i));
+      bool valid = !!(desc->cb_mask & (1 << i));
 
      address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
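
The patch leans on one recurring idiom: point UPLOAD_DST_ADDRESS_HIGH/LOW at a GPU address, set UPLOAD_LINE_LENGTH_IN to the byte count (with a line count of 1), then stream the payload through UPLOAD_EXEC in linear mode. A minimal sketch of how that sequence could be factored into a helper, assuming only the push-buffer macros already used in this file (the helper itself is hypothetical, not part of the patch):

/* Hypothetical helper: wraps the linear UPLOAD_* sequence used above. */
static inline void
nve4_cp_upload_linear(struct nouveau_pushbuf *push, uint64_t dst,
                      const uint32_t *data, unsigned words)
{
   /* destination address, split into high and low words */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, dst);
   PUSH_DATA (push, dst);
   /* one line of words * 4 bytes */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, words * 4);
   PUSH_DATA (push, 0x1);
   /* inline payload, linear layout */
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + words);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATAp(push, data, words);
}

For instance, the texture-handle upload in nve4_compute_set_tex_handles() is exactly nve4_cp_upload_linear(push, address + NVC0_CB_AUX_TEX_INFO(i), &nvc0->tex_handles[s][i], n), followed by the NVE4_COMPUTE_FLUSH_CB flush.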