X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fnouveau%2Fnvc0%2Fnvc0_compute.c;h=11635c94658b01b08c3621e9a78e9cbe098c5431;hb=48f04862c1d74844db9534b32ef73e5a2bc0ae74;hp=71804343138f7f4f8c4c5d9b79c7430b81d6acd6;hpb=e67f5cac796de5bce1081d723ade711a4f215f98;p=mesa.git diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 71804343138..11635c94658 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -23,7 +23,8 @@ */ #include "nvc0/nvc0_context.h" -#include "nvc0/nvc0_compute.h" + +#include "nvc0/nvc0_compute.xml.h" int nvc0_screen_compute_setup(struct nvc0_screen *screen, @@ -54,205 +55,447 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, return ret; } - ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL, - &screen->parm); - if (ret) - return ret; - - BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); + BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1); PUSH_DATA (push, screen->compute->oclass); /* hardware limit */ - BEGIN_NVC0(push, NVC0_COMPUTE(MP_LIMIT), 1); + BEGIN_NVC0(push, NVC0_CP(MP_LIMIT), 1); PUSH_DATA (push, screen->mp_count); - BEGIN_NVC0(push, NVC0_COMPUTE(CALL_LIMIT_LOG), 1); + BEGIN_NVC0(push, NVC0_CP(CALL_LIMIT_LOG), 1); PUSH_DATA (push, 0xf); - BEGIN_NVC0(push, SUBC_COMPUTE(0x02a0), 1); + BEGIN_NVC0(push, SUBC_CP(0x02a0), 1); PUSH_DATA (push, 0x8000); /* global memory setup */ - BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1); + BEGIN_NVC0(push, SUBC_CP(0x02c4), 1); PUSH_DATA (push, 0); - BEGIN_NIC0(push, NVC0_COMPUTE(GLOBAL_BASE), 0x100); + BEGIN_NIC0(push, NVC0_CP(GLOBAL_BASE), 0x100); for (i = 0; i <= 0xff; i++) PUSH_DATA (push, (0xc << 28) | (i << 16) | i); - BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1); + BEGIN_NVC0(push, SUBC_CP(0x02c4), 1); PUSH_DATA (push, 1); /* local memory and cstack setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_ADDRESS_HIGH), 2); + BEGIN_NVC0(push, NVC0_CP(TEMP_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->tls->offset); PUSH_DATA (push, screen->tls->offset); - BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_SIZE_HIGH), 2); + BEGIN_NVC0(push, NVC0_CP(TEMP_SIZE_HIGH), 2); PUSH_DATAh(push, screen->tls->size); PUSH_DATA (push, screen->tls->size); - BEGIN_NVC0(push, NVC0_COMPUTE(WARP_TEMP_ALLOC), 1); + BEGIN_NVC0(push, NVC0_CP(WARP_TEMP_ALLOC), 1); PUSH_DATA (push, 0); - BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_BASE), 1); - PUSH_DATA (push, 1 << 24); + BEGIN_NVC0(push, NVC0_CP(LOCAL_BASE), 1); + PUSH_DATA (push, 0xff << 24); /* shared memory setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(CACHE_SPLIT), 1); + BEGIN_NVC0(push, NVC0_CP(CACHE_SPLIT), 1); PUSH_DATA (push, NVC0_COMPUTE_CACHE_SPLIT_48K_SHARED_16K_L1); - BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_BASE), 1); - PUSH_DATA (push, 2 << 24); - BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 1); + BEGIN_NVC0(push, NVC0_CP(SHARED_BASE), 1); + PUSH_DATA (push, 0xfe << 24); + BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 1); PUSH_DATA (push, 0); /* code segment setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(CODE_ADDRESS_HIGH), 2); + BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->text->offset); PUSH_DATA (push, screen->text->offset); - /* TODO: textures & samplers */ + /* textures */ + BEGIN_NVC0(push, NVC0_CP(TIC_ADDRESS_HIGH), 3); + PUSH_DATAh(push, screen->txc->offset); + PUSH_DATA (push, screen->txc->offset); + PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1); + + /* samplers */ + BEGIN_NVC0(push, NVC0_CP(TSC_ADDRESS_HIGH), 3); + PUSH_DATAh(push, screen->txc->offset + 65536); + PUSH_DATA (push, screen->txc->offset + 65536); + PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1); + + /* MS sample coordinate offsets */ + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 2 * 8); + PUSH_DATA (push, NVC0_CB_AUX_MS_INFO); + PUSH_DATA (push, 0); /* 0 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 1); /* 1 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); /* 2 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 1); /* 3 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 2); /* 4 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 3); /* 5 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 2); /* 6 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 3); /* 7 */ + PUSH_DATA (push, 1); return 0; } -bool -nvc0_compute_validate_program(struct nvc0_context *nvc0) +static void +nvc0_compute_validate_samplers(struct nvc0_context *nvc0) +{ + bool need_flush = nvc0_validate_tsc(nvc0, 5); + if (need_flush) { + BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(TSC_FLUSH), 1); + PUSH_DATA (nvc0->base.pushbuf, 0); + } + + /* Invalidate all 3D samplers because they are aliased. */ + for (int s = 0; s < 5; s++) + nvc0->samplers_dirty[s] = ~0; + nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS; +} + +static void +nvc0_compute_validate_textures(struct nvc0_context *nvc0) { - struct nvc0_program *prog = nvc0->compprog; + bool need_flush = nvc0_validate_tic(nvc0, 5); + if (need_flush) { + BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(TIC_FLUSH), 1); + PUSH_DATA (nvc0->base.pushbuf, 0); + } + + /* Invalidate all 3D textures because they are aliased. */ + for (int s = 0; s < 5; s++) { + for (int i = 0; i < nvc0->num_textures[s]; i++) + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i)); + nvc0->textures_dirty[s] = ~0; + } + nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES; +} - if (prog->mem) - return true; +static inline void +nvc0_compute_invalidate_constbufs(struct nvc0_context *nvc0) +{ + int s; - if (!prog->translated) { - prog->translated = nvc0_program_translate( - prog, nvc0->screen->base.device->chipset, &nvc0->base.debug); - if (!prog->translated) - return false; + /* Invalidate all 3D constbufs because they are aliased with COMPUTE. */ + for (s = 0; s < 5; s++) { + nvc0->constbuf_dirty[s] |= nvc0->constbuf_valid[s]; + nvc0->state.uniform_buffer_bound[s] = 0; } - if (unlikely(!prog->code_size)) - return false; - - if (likely(prog->code_size)) { - if (nvc0_program_upload_code(nvc0, prog)) { - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); - PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE); - return true; + nvc0->dirty_3d |= NVC0_NEW_3D_CONSTBUF; +} + +static void +nvc0_compute_validate_constbufs(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + const int s = 5; + + while (nvc0->constbuf_dirty[s]) { + int i = ffs(nvc0->constbuf_dirty[s]) - 1; + nvc0->constbuf_dirty[s] &= ~(1 << i); + + if (nvc0->constbuf[s][i].user) { + struct nouveau_bo *bo = nvc0->screen->uniform_bo; + const unsigned base = NVC0_CB_USR_INFO(s); + const unsigned size = nvc0->constbuf[s][0].size; + assert(i == 0); /* we really only want OpenGL uniforms here */ + assert(nvc0->constbuf[s][0].u.data); + + if (nvc0->state.uniform_buffer_bound[s] < size) { + nvc0->state.uniform_buffer_bound[s] = align(size, 0x100); + + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, nvc0->state.uniform_buffer_bound[s]); + PUSH_DATAh(push, bo->offset + base); + PUSH_DATA (push, bo->offset + base); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); + PUSH_DATA (push, (0 << 8) | 1); + } + nvc0_cb_bo_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base), + base, nvc0->state.uniform_buffer_bound[s], + 0, (size + 3) / 4, + nvc0->constbuf[s][0].u.data); + } else { + struct nv04_resource *res = + nv04_resource(nvc0->constbuf[s][i].u.buf); + if (res) { + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, nvc0->constbuf[s][i].size); + PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset); + PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); + PUSH_DATA (push, (i << 8) | 1); + + BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD); + + res->cb_bindings[s] |= 1 << i; + } else { + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); + PUSH_DATA (push, (i << 8) | 0); + } + if (i == 0) + nvc0->state.uniform_buffer_bound[s] = 0; } } - return false; + + nvc0_compute_invalidate_constbufs(nvc0); + + BEGIN_NVC0(push, NVC0_CP(FLUSH), 1); + PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB); } -static bool -nvc0_compute_state_validate(struct nvc0_context *nvc0) +static void +nvc0_compute_validate_driverconst(struct nvc0_context *nvc0) { - if (!nvc0_compute_validate_program(nvc0)) - return false; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; - /* TODO: textures, samplers, surfaces, global memory buffers */ + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); + PUSH_DATA (push, (15 << 8) | 1); - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); + nvc0->dirty_3d |= NVC0_NEW_3D_DRIVERCONST; +} - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); - if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return false; - if (unlikely(nvc0->state.flushed)) - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); +static void +nvc0_compute_validate_buffers(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; + const int s = 5; + int i; + + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS); + PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0)); + + for (i = 0; i < NVC0_MAX_BUFFERS; i++) { + if (nvc0->buffers[s][i].buffer) { + struct nv04_resource *res = + nv04_resource(nvc0->buffers[s][i].buffer); + PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset); + PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset); + PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); + PUSH_DATA (push, 0); + BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR); + util_range_add(&res->valid_buffer_range, + nvc0->buffers[s][i].buffer_offset, + nvc0->buffers[s][i].buffer_offset + + nvc0->buffers[s][i].buffer_size); + } else { + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + } + } +} + +void +nvc0_compute_validate_globals(struct nvc0_context *nvc0) +{ + unsigned i; + + for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *); + ++i) { + struct pipe_resource *res = *util_dynarray_element( + &nvc0->global_residents, struct pipe_resource *, i); + if (res) + nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL, + nv04_resource(res), NOUVEAU_BO_RDWR); + } +} + +static inline void +nvc0_compute_invalidate_surfaces(struct nvc0_context *nvc0, const int s) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + int i; + + for (i = 0; i < NVC0_MAX_IMAGES; ++i) { + if (s == 5) + BEGIN_NVC0(push, NVC0_CP(IMAGE(i)), 6); + else + BEGIN_NVC0(push, NVC0_3D(IMAGE(i)), 6); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0); + PUSH_DATA(push, 0x14000); + PUSH_DATA(push, 0); + } +} + +static void +nvc0_compute_validate_surfaces(struct nvc0_context *nvc0) +{ + /* TODO: Invalidating both 3D and CP surfaces before validating surfaces for + * compute is probably not really necessary, but we didn't find any better + * solutions for now. This fixes some invalidation issues when compute and + * fragment shaders are used inside the same context. Anyway, we definitely + * have invalidation issues between 3D and CP for other resources like SSBO + * and atomic counters. */ + nvc0_compute_invalidate_surfaces(nvc0, 4); + nvc0_compute_invalidate_surfaces(nvc0, 5); + + nvc0_validate_suf(nvc0, 5); + + /* Invalidate all FRAGMENT images because they are aliased with COMPUTE. */ + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_SUF); + nvc0->dirty_3d |= NVC0_NEW_3D_SURFACES; + nvc0->images_dirty[4] |= nvc0->images_valid[4]; +} - return true; +static struct nvc0_state_validate +validate_list_cp[] = { + { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, + { nvc0_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF }, + { nvc0_compute_validate_driverconst, NVC0_NEW_CP_DRIVERCONST }, + { nvc0_compute_validate_buffers, NVC0_NEW_CP_BUFFERS }, + { nvc0_compute_validate_textures, NVC0_NEW_CP_TEXTURES }, + { nvc0_compute_validate_samplers, NVC0_NEW_CP_SAMPLERS }, + { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, + { nvc0_compute_validate_surfaces, NVC0_NEW_CP_SURFACES }, +}; +static bool +nvc0_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) +{ + bool ret; + + ret = nvc0_state_validate(nvc0, mask, validate_list_cp, + ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp, + nvc0->bufctx_cp); + + if (unlikely(nvc0->state.flushed)) + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); + return ret; } static void -nvc0_compute_upload_input(struct nvc0_context *nvc0, const void *input) +nvc0_compute_upload_input(struct nvc0_context *nvc0, + const struct pipe_grid_info *info) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_screen *screen = nvc0->screen; struct nvc0_program *cp = nvc0->compprog; if (cp->parm_size) { - BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); + struct nouveau_bo *bo = screen->uniform_bo; + const unsigned base = NVC0_CB_USR_INFO(5); + + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); PUSH_DATA (push, align(cp->parm_size, 0x100)); - PUSH_DATAh(push, screen->parm->offset); - PUSH_DATA (push, screen->parm->offset); - BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); + PUSH_DATAh(push, bo->offset + base); + PUSH_DATA (push, bo->offset + base); + BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); PUSH_DATA (push, (0 << 8) | 1); /* NOTE: size is limited to 4 KiB, which is < NV04_PFIFO_MAX_PACKET_LEN */ - BEGIN_1IC0(push, NVC0_COMPUTE(CB_POS), 1 + cp->parm_size / 4); + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + cp->parm_size / 4); PUSH_DATA (push, 0); - PUSH_DATAp(push, input, cp->parm_size / 4); + PUSH_DATAp(push, info->input, cp->parm_size / 4); - BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); - PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB); + nvc0_compute_invalidate_constbufs(nvc0); } + + BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + + BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 1); + /* (7) as we only upload work_dim on nvc0, the rest uses special regs */ + PUSH_DATA (push, NVC0_CB_AUX_GRID_INFO(7)); + PUSH_DATA (push, info->work_dim); + + BEGIN_NVC0(push, NVC0_CP(FLUSH), 1); + PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB); } void -nvc0_launch_grid(struct pipe_context *pipe, - const uint *block_layout, const uint *grid_layout, - uint32_t label, - const void *input) +nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_program *cp = nvc0->compprog; - unsigned s, i; int ret; - ret = !nvc0_compute_state_validate(nvc0); + ret = !nvc0_state_validate_cp(nvc0, ~0); if (ret) { NOUVEAU_ERR("Failed to launch grid !\n"); return; } - nvc0_compute_upload_input(nvc0, input); + nvc0_compute_upload_input(nvc0, info); - BEGIN_NVC0(push, NVC0_COMPUTE(CP_START_ID), 1); - PUSH_DATA (push, nvc0_program_symbol_offset(cp, label)); + BEGIN_NVC0(push, NVC0_CP(CP_START_ID), 1); + PUSH_DATA (push, nvc0_program_symbol_offset(cp, info->pc)); - BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_POS_ALLOC), 3); - PUSH_DATA (push, align(cp->cp.lmem_size, 0x10)); + BEGIN_NVC0(push, NVC0_CP(LOCAL_POS_ALLOC), 3); + PUSH_DATA (push, (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10)); PUSH_DATA (push, 0); PUSH_DATA (push, 0x800); /* WARP_CSTACK_SIZE */ - BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 3); + BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 3); PUSH_DATA (push, align(cp->cp.smem_size, 0x100)); - PUSH_DATA (push, block_layout[0] * block_layout[1] * block_layout[2]); + PUSH_DATA (push, info->block[0] * info->block[1] * info->block[2]); PUSH_DATA (push, cp->num_barriers); - BEGIN_NVC0(push, NVC0_COMPUTE(CP_GPR_ALLOC), 1); + BEGIN_NVC0(push, NVC0_CP(CP_GPR_ALLOC), 1); PUSH_DATA (push, cp->num_gprs); - /* grid/block setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(GRIDDIM_YX), 2); - PUSH_DATA (push, (grid_layout[1] << 16) | grid_layout[0]); - PUSH_DATA (push, grid_layout[2]); - BEGIN_NVC0(push, NVC0_COMPUTE(BLOCKDIM_YX), 2); - PUSH_DATA (push, (block_layout[1] << 16) | block_layout[0]); - PUSH_DATA (push, block_layout[2]); - /* launch preliminary setup */ - BEGIN_NVC0(push, NVC0_COMPUTE(GRIDID), 1); + BEGIN_NVC0(push, NVC0_CP(GRIDID), 1); PUSH_DATA (push, 0x1); - BEGIN_NVC0(push, SUBC_COMPUTE(0x036c), 1); + BEGIN_NVC0(push, SUBC_CP(0x036c), 1); PUSH_DATA (push, 0); - BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); + BEGIN_NVC0(push, NVC0_CP(FLUSH), 1); PUSH_DATA (push, NVC0_COMPUTE_FLUSH_GLOBAL | NVC0_COMPUTE_FLUSH_UNK8); - /* kernel launching */ - BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_BEGIN), 1); - PUSH_DATA (push, 0); - BEGIN_NVC0(push, SUBC_COMPUTE(0x0a08), 1); - PUSH_DATA (push, 0); - BEGIN_NVC0(push, NVC0_COMPUTE(LAUNCH), 1); - PUSH_DATA (push, 0x1000); - BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_END), 1); - PUSH_DATA (push, 0); - BEGIN_NVC0(push, SUBC_COMPUTE(0x0360), 1); - PUSH_DATA (push, 0x1); - - /* rebind all the 3D constant buffers - * (looks like binding a CB on COMPUTE clobbers 3D state) */ - nvc0->dirty |= NVC0_NEW_CONSTBUF; - for (s = 0; s < 5; s++) { - for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; i++) - if (nvc0->constbuf[s][i].u.buf) - nvc0->constbuf_dirty[s] |= 1 << i; + /* block setup */ + BEGIN_NVC0(push, NVC0_CP(BLOCKDIM_YX), 2); + PUSH_DATA (push, (info->block[1] << 16) | info->block[0]); + PUSH_DATA (push, info->block[2]); + + if (unlikely(info->indirect)) { + struct nv04_resource *res = nv04_resource(info->indirect); + uint32_t offset = res->offset + info->indirect_offset; + unsigned macro = NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT; + + nouveau_pushbuf_space(push, 16, 0, 1); + PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain); + PUSH_DATA(push, NVC0_FIFO_PKHDR_1I(1, macro, 3)); + nouveau_pushbuf_data(push, res->bo, offset, + NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4); + } else { + /* grid setup */ + BEGIN_NVC0(push, NVC0_CP(GRIDDIM_YX), 2); + PUSH_DATA (push, (info->grid[1] << 16) | info->grid[0]); + PUSH_DATA (push, info->grid[2]); + + /* kernel launching */ + BEGIN_NVC0(push, NVC0_CP(COMPUTE_BEGIN), 1); + PUSH_DATA (push, 0); + BEGIN_NVC0(push, SUBC_CP(0x0a08), 1); + PUSH_DATA (push, 0); + BEGIN_NVC0(push, NVC0_CP(LAUNCH), 1); + PUSH_DATA (push, 0x1000); + BEGIN_NVC0(push, NVC0_CP(COMPUTE_END), 1); + PUSH_DATA (push, 0); + BEGIN_NVC0(push, SUBC_CP(0x0360), 1); + PUSH_DATA (push, 0x1); } - memset(nvc0->state.uniform_buffer_bound, 0, - sizeof(nvc0->state.uniform_buffer_bound)); + + /* TODO: Not sure if this is really necessary. */ + nvc0_compute_invalidate_surfaces(nvc0, 5); + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF); + nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES; + nvc0->images_dirty[5] |= nvc0->images_valid[5]; }