From a79521d497bc87309cadc49f3a414703497522bc Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Wed, 20 Jan 2010 09:04:37 +0100 Subject: [PATCH] nvfx: use dynamically sized rotating BO pool for fragment programs Currently we used a single buffer for each fragment programs, leading to rendering synchronization. This patch uses a doubly linked list of BOs, which is dynamically resized if all the BOs are busy. Note that inline image transfers could be an alternative option: this will be explored later. This removes one of the big performance limitations of the current driver. We also stop using pipe_resource internally in favor of using nouveau_bo directly. --- src/gallium/drivers/nvfx/nvfx_fragprog.c | 232 ++++++++++++----------- src/gallium/drivers/nvfx/nvfx_state.h | 15 +- 2 files changed, 137 insertions(+), 110 deletions(-) diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c index 301ad82c08b..5fa825ad05d 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragprog.c +++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c @@ -823,45 +823,18 @@ out_err: FREE(fpc); } -static void -nvfx_fragprog_upload(struct nvfx_context *nvfx, - struct nvfx_fragment_program *fp) +static inline void +nvfx_fp_memcpy(void* dst, const void* src, size_t len) { - struct pipe_context *pipe = &nvfx->pipe; - const uint32_t le = 1; - -#if 0 - for (i = 0; i < fp->insn_len; i++) { - fflush(stdout); fflush(stderr); - NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]); - fflush(stdout); fflush(stderr); +#ifndef WORDS_BIGENDIAN + memcpy(dst, src, len); +#else + size_t i; + for(i = 0; i < len; i += 4) { + uint32_t v = (uint32_t*)((char*)src + i); + *(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16); } #endif - - if ((*(const uint8_t *)&le)) { - /* Can do this with an inline transfer */ - pipe_buffer_write(pipe, - fp->buffer, - 0, - fp->insn_len * sizeof fp->insn[0], - fp->insn); - } else { - struct pipe_transfer *transfer; - uint32_t *map; - int i; - - map = pipe_buffer_map(pipe, fp->buffer, - PIPE_TRANSFER_WRITE, - &transfer); - - /* Weird swapping for big-endian chips */ - for (i = 0; i < fp->insn_len; i++) { - map[i] = ((fp->insn[i] & 0xffff) << 16) | - ((fp->insn[i] >> 16) & 0xffff); - } - - pipe_buffer_unmap(pipe, fp->buffer, transfer); - } } void @@ -869,83 +842,118 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) { struct nouveau_channel* chan = nvfx->screen->base.channel; struct nvfx_fragment_program *fp = nvfx->fragprog; - struct pipe_resource *constbuf = - nvfx->constbuf[PIPE_SHADER_FRAGMENT]; - struct pipe_screen *pscreen = nvfx->pipe.screen; - boolean new_consts = FALSE; + int update = 0; int i; - if (fp->translated) - goto update_constants; + if (!fp->translated) + { + nvfx_fragprog_translate(nvfx, fp); + if (!fp->translated) { + static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0}; + static int warned = 0; + if(!warned) + { + fprintf(stderr, "nvfx: failed to translate fragment program!\n"); + warned = 1; + } - nvfx_fragprog_translate(nvfx, fp); - if (!fp->translated) { - static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0}; - static int warned = 0; - if(!warned) - { - fprintf(stderr, "nvfx: failed to translate fragment program!\n"); - warned = 1; + /* use dummy program: we cannot fail here */ + fp->translated = TRUE; + fp->insn = malloc(sizeof(dummy)); + memcpy(fp->insn, dummy, sizeof(dummy)); + fp->insn_len = sizeof(dummy) / sizeof(dummy[0]); } + update = TRUE; - /* use a dummy program: we cannot fail here */ - fp->translated = TRUE; - fp->insn = malloc(sizeof(dummy)); - memcpy(fp->insn, dummy, sizeof(dummy)); - fp->insn_len = sizeof(dummy) / sizeof(dummy[0]); - return; + fp->prog_size = (fp->insn_len * 4 + 63) & ~63; + + int min_size = 4096; + if(fp->prog_size >= min_size) + fp->progs_per_bo = 1; + else + fp->progs_per_bo = min_size / fp->prog_size; + fp->bo_prog_idx = fp->progs_per_bo - 1; } - fp->buffer = pipe_buffer_create(pscreen, - /* XXX: no alignment, maybe use a priv bind flag - * 0x100, - */ - 0, fp->insn_len * 4); - nvfx_fragprog_upload(nvfx, fp); - -update_constants: - if (fp->nr_consts) { - struct pipe_transfer *transfer; - float *map; - - map = pipe_buffer_map(&nvfx->pipe, constbuf, - PIPE_TRANSFER_READ, - &transfer); - - /* XXX: probably a bad idea to be reading back data - * from a buffer the gpu has been using. Not really - * sure what this code is doing though, or how to - * avoid it - kw. - */ - for (i = 0; i < fp->nr_consts; i++) { - struct nvfx_fragment_program_data *fpd = &fp->consts[i]; - uint32_t *p = &fp->insn[fpd->offset]; - uint32_t *cb = (uint32_t *)&map[fpd->index * 4]; - - if (!memcmp(p, cb, 4 * sizeof(float))) - continue; - memcpy(p, cb, 4 * sizeof(float)); - new_consts = TRUE; + if (nvfx->dirty & NVFX_NEW_FRAGCONST) + update = TRUE; + + if(update) { + ++fp->bo_prog_idx; + if(fp->bo_prog_idx >= fp->progs_per_bo) + { + if(fp->fpbo && !nouveau_bo_busy(fp->fpbo->next->bo, NOUVEAU_BO_WR)) + { + fp->fpbo = fp->fpbo->next; + } + else + { + struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + fp->prog_size * fp->progs_per_bo, 16); + if(fp->fpbo) + { + fpbo->next = fp->fpbo->next; + fp->fpbo->next = fpbo; + } + else + fpbo->next = fpbo; + fp->fpbo = fpbo; + fpbo->bo = 0; + nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo); + nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC); + + char* map = fpbo->bo->map; + char* buf = fpbo->insn; + for(int i = 0; i < fp->progs_per_bo; ++i) + { + memcpy(buf, fp->insn, fp->insn_len * 4); + nvfx_fp_memcpy(map, fp->insn, fp->insn_len * 4); + map += fp->prog_size; + buf += fp->prog_size; + } + } + fp->bo_prog_idx = 0; } - pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer); - if (new_consts) - nvfx_fragprog_upload(nvfx, fp); + int offset = fp->bo_prog_idx * fp->prog_size; + + if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) { + struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT]; + // TODO: avoid using transfers, just directly the buffer + struct pipe_transfer* transfer; + // TODO: does this check make any sense, or should we do this unconditionally? + uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer); + uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset); + uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset); + for (i = 0; i < fp->nr_consts; ++i) { + unsigned off = fp->consts[i].offset; + unsigned idx = fp->consts[i].index * 4; + + /* TODO: is checking a good idea? */ + if(memcmp(&buf[off], &map[idx], 4 * sizeof(uint32_t))) { + memcpy(&buf[off], &map[idx], 4 * sizeof(uint32_t)); + nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t)); + } + } + pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer); + } } - MARK_RING(chan, 8, 1); - OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1)); - OUT_RELOC(chan, nvfx_resource(fp->buffer)->bo, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW | - NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0, - NV34TCL_FP_ACTIVE_PROGRAM_DMA1); - OUT_RING(chan, RING_3D(NV34TCL_FP_CONTROL, 1)); - OUT_RING(chan, fp->fp_control); - if(!nvfx->is_nv4x) { - OUT_RING(chan, RING_3D(NV34TCL_FP_REG_CONTROL, 1)); - OUT_RING(chan, (1<<16)|0x4); - OUT_RING(chan, RING_3D(NV34TCL_TX_UNITS_ENABLE, 1)); - OUT_RING(chan, fp->samplers); + if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) { + int offset = fp->bo_prog_idx * fp->prog_size; + MARK_RING(chan, 8, 1); + OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1)); + OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM | + NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW | + NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0, + NV34TCL_FP_ACTIVE_PROGRAM_DMA1); + OUT_RING(chan, RING_3D(NV34TCL_FP_CONTROL, 1)); + OUT_RING(chan, fp->fp_control); + if(!nvfx->is_nv4x) { + OUT_RING(chan, RING_3D(NV34TCL_FP_REG_CONTROL, 1)); + OUT_RING(chan, (1<<16)|0x4); + OUT_RING(chan, RING_3D(NV34TCL_TX_UNITS_ENABLE, 1)); + OUT_RING(chan, fp->samplers); + } } } @@ -954,12 +962,13 @@ nvfx_fragprog_relocate(struct nvfx_context *nvfx) { struct nouveau_channel* chan = nvfx->screen->base.channel; struct nvfx_fragment_program *fp = nvfx->fragprog; - struct nouveau_bo* bo = nvfx_resource(fp->buffer)->bo; + struct nouveau_bo* bo = fp->fpbo->bo; + int offset = fp->bo_prog_idx * fp->prog_size; unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART? fp_flags |= NOUVEAU_BO_DUMMY; MARK_RING(chan, 2, 2); OUT_RELOC(chan, bo, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1), fp_flags, 0, 0); - OUT_RELOC(chan, bo, 0, fp_flags | NOUVEAU_BO_LOW | + OUT_RELOC(chan, bo, offset, fp_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0, NV34TCL_FP_ACTIVE_PROGRAM_DMA1); } @@ -968,8 +977,19 @@ void nvfx_fragprog_destroy(struct nvfx_context *nvfx, struct nvfx_fragment_program *fp) { - if (fp->buffer) - pipe_resource_reference(&fp->buffer, NULL); + struct nvfx_fragment_program_bo* fpbo = fp->fpbo; + if(fpbo) + { + do + { + struct nvfx_fragment_program_bo* next = fpbo->next; + nouveau_bo_unmap(fpbo->bo); + nouveau_bo_ref(0, &fpbo->bo); + free(fpbo); + fpbo = next; + } + while(fpbo != fp->fpbo); + } if (fp->insn_len) FREE(fp->insn); diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h index 555513a6428..9ceb2577ecc 100644 --- a/src/gallium/drivers/nvfx/nvfx_state.h +++ b/src/gallium/drivers/nvfx/nvfx_state.h @@ -46,6 +46,12 @@ struct nvfx_fragment_program_data { unsigned index; }; +struct nvfx_fragment_program_bo { + struct nvfx_fragment_program_bo* next; + struct nouveau_bo* bo; + char insn[] __attribute__((aligned(16))); +}; + struct nvfx_fragment_program { struct pipe_shader_state pipe; struct tgsi_shader_info info; @@ -58,12 +64,13 @@ struct nvfx_fragment_program { struct nvfx_fragment_program_data *consts; unsigned nr_consts; - - /* XXX: just use a nouveau_bo for this? - */ - struct pipe_resource *buffer; uint32_t fp_control; + + unsigned bo_prog_idx; + unsigned prog_size; + unsigned progs_per_bo; + struct nvfx_fragment_program_bo* fpbo; }; -- 2.30.2