From: Ben Skeggs Date: Sun, 3 Feb 2008 01:08:31 +0000 (+1100) Subject: nouveau: avoid relocations where possible. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=705022f98c32c44b94411ea13dfe4cbc899f5a77;p=mesa.git nouveau: avoid relocations where possible. Potential relocations are emitted as NOPs where they're needed. In the event a buffer moves, the pushbuf code will emit the relevant state changes into the NOPs. Just a start, more work is needed to get this looking how I want it to. --- diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h b/src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h index 7a539c81a94..59febca2929 100644 --- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h +++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h @@ -61,9 +61,8 @@ } while(0) #define OUT_RELOC(buf,data,flags,vor,tor) do { \ - nouveau_pipe_emit_reloc(nv->channel, nv->channel->pushbuf->cur, \ + nouveau_pipe_emit_reloc(nv->channel, nv->channel->pushbuf->cur++, \ buf, (data), (flags), (vor), (tor)); \ - OUT_RING(0); \ } while(0) /* Raw data + flags depending on FB/TT buffer */ diff --git a/src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c b/src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c index a34a5c18662..7d5eddb92ff 100644 --- a/src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c +++ b/src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c @@ -96,6 +96,31 @@ nouveau_pushbuf_init(struct nouveau_channel *chan) return 0; } +static uint32_t +nouveau_pushbuf_calc_reloc(struct nouveau_bo *bo, + struct nouveau_pushbuf_reloc *r) +{ + uint32_t push; + + if (r->flags & NOUVEAU_BO_LOW) { + push = bo->offset + r->data; + } else + if (r->flags & NOUVEAU_BO_HIGH) { + push = (bo->offset + r->data) >> 32; + } else { + push = r->data; + } + + if (r->flags & NOUVEAU_BO_OR) { + if (bo->flags & NOUVEAU_BO_VRAM) + push |= r->vor; + else + push |= r->tor; + } + + return push; +} + /* This would be our TTM "superioctl" */ int nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min) @@ -133,34 +158,20 @@ nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min) if (bo->offset == nouveau_bo(bo)->offset && bo->flags == nouveau_bo(bo)->flags) { - /*XXX: could avoid reloc in this case, except with the - * current design we'd confuse the GPU quite a bit - * if we did this. Will fix soon. - */ + while ((r = ptr_to_pbrel(pbbo->relocs))) { + pbbo->relocs = r->next; + free(r); + } + + nvpb->buffers = pbbo->next; + free(pbbo); + continue; } bo->offset = nouveau_bo(bo)->offset; bo->flags = nouveau_bo(bo)->flags; while ((r = ptr_to_pbrel(pbbo->relocs))) { - uint32_t push; - - if (r->flags & NOUVEAU_BO_LOW) { - push = bo->offset + r->data; - } else - if (r->flags & NOUVEAU_BO_HIGH) { - push = (bo->offset + r->data) >> 32; - } else { - push = r->data; - } - - if (r->flags & NOUVEAU_BO_OR) { - if (bo->flags & NOUVEAU_BO_VRAM) - push |= r->vor; - else - push |= r->tor; - } - - *r->ptr = push; + *r->ptr = nouveau_pushbuf_calc_reloc(bo, r); pbbo->relocs = r->next; free(r); } @@ -241,6 +252,10 @@ nouveau_pushbuf_emit_reloc(struct nouveau_channel *chan, void *ptr, r->vor = vor; r->tor = tor; + if (flags & NOUVEAU_BO_DUMMY) + *(uint32_t *)ptr = 0; + else + *(uint32_t *)ptr = nouveau_pushbuf_calc_reloc(bo, r); return 0; } diff --git a/src/mesa/pipe/nouveau/nouveau_bo.h b/src/mesa/pipe/nouveau/nouveau_bo.h index 2b57ee92633..18020e9c652 100644 --- a/src/mesa/pipe/nouveau/nouveau_bo.h +++ b/src/mesa/pipe/nouveau/nouveau_bo.h @@ -35,6 +35,7 @@ #define NOUVEAU_BO_HIGH (1 << 7) #define NOUVEAU_BO_OR (1 << 8) #define NOUVEAU_BO_LOCAL (1 << 9) +#define NOUVEAU_BO_DUMMY (1 << 31) struct nouveau_bo { struct nouveau_device *device; diff --git a/src/mesa/pipe/nouveau/nouveau_push.h b/src/mesa/pipe/nouveau/nouveau_push.h index 117e3535cf7..679472669b9 100644 --- a/src/mesa/pipe/nouveau/nouveau_push.h +++ b/src/mesa/pipe/nouveau/nouveau_push.h @@ -44,9 +44,8 @@ #define OUT_RELOC(bo,data,flags,vor,tor) do { \ NOUVEAU_PUSH_CONTEXT(pc); \ pc->nvws->push_reloc(pc->nvws->channel, \ - pc->nvws->channel->pushbuf->cur, \ + pc->nvws->channel->pushbuf->cur++, \ (bo), (data), (flags), (vor), (tor)); \ - OUT_RING(0); \ } while(0) /* Raw data + flags depending on FB/TT buffer */ @@ -71,4 +70,14 @@ OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0); \ } while(0) +/* A reloc which'll recombine into a NV_DMA_METHOD packet header */ +#define OUT_RELOCm(bo, flags, obj, mthd, size) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + if (pc->nvws->channel->pushbuf->remaining < ((size) + 1)) \ + pc->nvws->push_flush(pc->nvws->channel, ((size) + 1)); \ + OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd), \ + (flags), 0, 0); \ + pc->nvws->channel->pushbuf->remaining -= ((size) + 1); \ +} while(0) + #endif diff --git a/src/mesa/pipe/nv40/nv40_fragprog.c b/src/mesa/pipe/nv40/nv40_fragprog.c index 14897f97982..667eb89cb24 100644 --- a/src/mesa/pipe/nv40/nv40_fragprog.c +++ b/src/mesa/pipe/nv40/nv40_fragprog.c @@ -815,6 +815,11 @@ nv40_fragprog_bind(struct nv40_context *nv40, struct nv40_fragment_program *fp) fp->on_hw = TRUE; } + BEGIN_RING(curie, NV40TCL_FP_ADDRESS, 1); + OUT_RELOC (fp->buffer, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW | + NOUVEAU_BO_OR, NV40TCL_FP_ADDRESS_DMA0, + NV40TCL_FP_ADDRESS_DMA1); BEGIN_RING(curie, NV40TCL_FP_CONTROL, 1); OUT_RING (fp->fp_control); diff --git a/src/mesa/pipe/nv40/nv40_fragtex.c b/src/mesa/pipe/nv40/nv40_fragtex.c index 48d6eb629f5..7c5ecd5c566 100644 --- a/src/mesa/pipe/nv40/nv40_fragtex.c +++ b/src/mesa/pipe/nv40/nv40_fragtex.c @@ -104,7 +104,13 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit) nv40->tex[unit].buffer = nv40mt->buffer; nv40->tex[unit].format = txf; - BEGIN_RING(curie, NV40TCL_TEX_WRAP(unit), 6); + BEGIN_RING(curie, NV40TCL_TEX_OFFSET(unit), 8); + OUT_RELOCl(nv40->tex[unit].buffer, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv40->tex[unit].buffer, nv40->tex[unit].format, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | + NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0, + NV40TCL_TEX_FORMAT_DMA1); OUT_RING (ps->wrap); OUT_RING (NV40TCL_TEX_ENABLE_ENABLE | ps->en | (0x00078000) /* mipmap related? */); diff --git a/src/mesa/pipe/nv40/nv40_state.c b/src/mesa/pipe/nv40/nv40_state.c index c619948b55a..bb435b106b4 100644 --- a/src/mesa/pipe/nv40/nv40_state.c +++ b/src/mesa/pipe/nv40/nv40_state.c @@ -603,33 +603,51 @@ nv40_set_framebuffer_state(struct pipe_context *pipe, } if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) { - BEGIN_RING(curie, NV40TCL_COLOR0_PITCH, 1); - OUT_RING (rt[0]->pitch * rt[0]->cpp); nv40->rt[0] = rt[0]->buffer; + BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 1); + OUT_RELOCo(nv40->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(curie, NV40TCL_COLOR0_PITCH, 2); + OUT_RING (rt[0]->pitch * rt[0]->cpp); + OUT_RELOCl(nv40->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); } if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) { - BEGIN_RING(curie, NV40TCL_COLOR1_PITCH, 2); - OUT_RING (rt[1]->pitch * rt[1]->cpp); nv40->rt[1] = rt[1]->buffer; + BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1); + OUT_RELOCo(nv40->rt[1], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(curie, NV40TCL_COLOR1_OFFSET, 2); + OUT_RELOCl(nv40->rt[1], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RING (rt[1]->pitch * rt[1]->cpp); } if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) { + nv40->rt[2] = rt[2]->buffer; + BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 1); + OUT_RELOCo(nv40->rt[2], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(curie, NV40TCL_COLOR2_OFFSET, 1); + OUT_RELOCl(nv40->rt[2], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); BEGIN_RING(curie, NV40TCL_COLOR2_PITCH, 1); OUT_RING (rt[2]->pitch * rt[2]->cpp); - nv40->rt[2] = rt[2]->buffer; } if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) { + nv40->rt[3] = rt[3]->buffer; + BEGIN_RING(curie, NV40TCL_DMA_COLOR3, 1); + OUT_RELOCo(nv40->rt[3], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(curie, NV40TCL_COLOR3_OFFSET, 1); + OUT_RELOCl(nv40->rt[3], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); BEGIN_RING(curie, NV40TCL_COLOR3_PITCH, 1); OUT_RING (rt[3]->pitch * rt[3]->cpp); - nv40->rt[3] = rt[3]->buffer; } if (zeta_format) { + nv40->zeta = zeta->buffer; + BEGIN_RING(curie, NV40TCL_DMA_ZETA, 1); + OUT_RELOCo(nv40->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(curie, NV40TCL_ZETA_OFFSET, 1); + OUT_RELOCl(nv40->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); BEGIN_RING(curie, NV40TCL_ZETA_PITCH, 1); OUT_RING (zeta->pitch * zeta->cpp); - nv40->zeta = zeta->buffer; } nv40->rt_enable = rt_enable; diff --git a/src/mesa/pipe/nv40/nv40_state_emit.c b/src/mesa/pipe/nv40/nv40_state_emit.c index c9a7a2e364b..66b98d5fab3 100644 --- a/src/mesa/pipe/nv40/nv40_state_emit.c +++ b/src/mesa/pipe/nv40/nv40_state_emit.c @@ -1,94 +1,114 @@ #include "nv40_context.h" #include "nv40_state.h" -void -nv40_emit_hw_state(struct nv40_context *nv40) +/* Emit relocs for every referenced buffer. + * + * This is to ensure the bufmgr has an accurate idea of how + * the buffer is used. These relocs appear in the push buffer as + * NOPs, and will only be turned into state changes if a buffer + * actually moves. + */ +static void +nv40_state_emit_dummy_relocs(struct nv40_context *nv40) { - int i; - - if (nv40->dirty & NV40_NEW_FRAGPROG) { - nv40_fragprog_bind(nv40, nv40->fragprog.current); - /*XXX: clear NV40_NEW_FRAGPROG if no new program uploaded */ - } - - if (nv40->dirty_samplers || (nv40->dirty & NV40_NEW_FRAGPROG)) { - nv40_fragtex_bind(nv40); - - BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); - OUT_RING (2); - BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); - OUT_RING (1); - nv40->dirty &= ~NV40_NEW_FRAGPROG; - } - - if (nv40->dirty & NV40_NEW_VERTPROG) { - nv40_vertprog_bind(nv40, nv40->vertprog.current); - nv40->dirty &= ~NV40_NEW_VERTPROG; - } - - nv40->dirty_samplers = 0; - - /* Emit relocs for every referenced buffer. - * This is to ensure the bufmgr has an accurate idea of how - * the buffer is used. This isn't very efficient, but we don't - * seem to take a significant performance hit. Will be improved - * at some point. Vertex arrays are emitted by nv40_vbo.c - */ + unsigned rt_flags, tx_flags, fp_flags; + int i; + + rt_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR | NOUVEAU_BO_DUMMY; + tx_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | + NOUVEAU_BO_DUMMY; + fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | + NOUVEAU_BO_DUMMY; /* Render targets */ if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR0) { - BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 1); - OUT_RELOCo(nv40->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - BEGIN_RING(curie, NV40TCL_COLOR0_OFFSET, 1); - OUT_RELOCl(nv40->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCm(nv40->rt[0], rt_flags, + curie, NV40TCL_DMA_COLOR0, 1); + OUT_RELOCo(nv40->rt[0], rt_flags); + OUT_RELOCm(nv40->rt[0], rt_flags, + curie, NV40TCL_COLOR0_OFFSET, 1); + OUT_RELOCl(nv40->rt[0], 0, rt_flags); } if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR1) { - BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1); - OUT_RELOCo(nv40->rt[1], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - BEGIN_RING(curie, NV40TCL_COLOR1_OFFSET, 1); - OUT_RELOCl(nv40->rt[1], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCm(nv40->rt[1], rt_flags, + curie, NV40TCL_DMA_COLOR1, 1); + OUT_RELOCo(nv40->rt[1], rt_flags); + OUT_RELOCm(nv40->rt[1], rt_flags, + curie, NV40TCL_COLOR1_OFFSET, 1); + OUT_RELOCl(nv40->rt[1], 0, rt_flags); } if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR2) { - BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 1); - OUT_RELOCo(nv40->rt[2], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - BEGIN_RING(curie, NV40TCL_COLOR2_OFFSET, 1); - OUT_RELOCl(nv40->rt[2], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCm(nv40->rt[2], rt_flags, + curie, NV40TCL_DMA_COLOR2, 1); + OUT_RELOCo(nv40->rt[2], rt_flags); + OUT_RELOCm(nv40->rt[2], rt_flags, + curie, NV40TCL_COLOR2_OFFSET, 1); + OUT_RELOCl(nv40->rt[2], 0, rt_flags); } if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR3) { - BEGIN_RING(curie, NV40TCL_DMA_COLOR3, 1); - OUT_RELOCo(nv40->rt[3], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - BEGIN_RING(curie, NV40TCL_COLOR3_OFFSET, 1); - OUT_RELOCl(nv40->rt[3], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCm(nv40->rt[3], rt_flags, + curie, NV40TCL_DMA_COLOR3, 1); + OUT_RELOCo(nv40->rt[3], rt_flags); + OUT_RELOCm(nv40->rt[3], rt_flags, + curie, NV40TCL_COLOR3_OFFSET, 1); + OUT_RELOCl(nv40->rt[3], 0, rt_flags); } if (nv40->zeta) { - BEGIN_RING(curie, NV40TCL_DMA_ZETA, 1); - OUT_RELOCo(nv40->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - BEGIN_RING(curie, NV40TCL_ZETA_OFFSET, 1); - OUT_RELOCl(nv40->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCm(nv40->zeta, rt_flags, curie, NV40TCL_DMA_ZETA, 1); + OUT_RELOCo(nv40->zeta, rt_flags); + OUT_RELOCm(nv40->zeta, rt_flags, curie, NV40TCL_ZETA_OFFSET, 1); + OUT_RELOCl(nv40->zeta, 0, rt_flags); } /* Texture images */ for (i = 0; i < 16; i++) { if (!(nv40->fp_samplers & (1 << i))) continue; - BEGIN_RING(curie, NV40TCL_TEX_OFFSET(i), 2); - OUT_RELOCl(nv40->tex[i].buffer, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCm(nv40->tex[i].buffer, tx_flags, + curie, NV40TCL_TEX_OFFSET(i), 2); + OUT_RELOCl(nv40->tex[i].buffer, 0, tx_flags); OUT_RELOCd(nv40->tex[i].buffer, nv40->tex[i].format, - NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | - NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0, + tx_flags | NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1); } /* Fragment program */ - BEGIN_RING(curie, NV40TCL_FP_ADDRESS, 1); - OUT_RELOC (nv40->fragprog.active->buffer, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW | - NOUVEAU_BO_OR, NV40TCL_FP_ADDRESS_DMA0, - NV40TCL_FP_ADDRESS_DMA1); + OUT_RELOCm(nv40->fragprog.active->buffer, fp_flags, + curie, NV40TCL_FP_ADDRESS, 1); + OUT_RELOC (nv40->fragprog.active->buffer, 0, + fp_flags | NOUVEAU_BO_OR | NOUVEAU_BO_LOW, + NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1); +} + +void +nv40_emit_hw_state(struct nv40_context *nv40) +{ + if (nv40->dirty & NV40_NEW_FRAGPROG) { + nv40_fragprog_bind(nv40, nv40->fragprog.current); + /*XXX: clear NV40_NEW_FRAGPROG if no new program uploaded */ + } + + if (nv40->dirty_samplers || (nv40->dirty & NV40_NEW_FRAGPROG)) { + nv40_fragtex_bind(nv40); + + BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); + OUT_RING (2); + BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); + OUT_RING (1); + nv40->dirty &= ~NV40_NEW_FRAGPROG; + } + + if (nv40->dirty & NV40_NEW_VERTPROG) { + nv40_vertprog_bind(nv40, nv40->vertprog.current); + nv40->dirty &= ~NV40_NEW_VERTPROG; + } + + nv40->dirty_samplers = 0; + + nv40_state_emit_dummy_relocs(nv40); }