nouveau: avoid relocations where possible.
author Ben Skeggs <skeggsb@gmail.com>
Sun, 3 Feb 2008 01:08:31 +0000 (12:08 +1100)
committer Ben Skeggs <skeggsb@gmail.com>
Sun, 3 Feb 2008 01:08:49 +0000 (12:08 +1100)
Potential relocations are emitted as NOPs where they're needed.  In the
event a buffer moves, the pushbuf code will emit the relevant state
changes into the NOPs.

Just a start, more work is needed to get this looking how I want it to.

src/mesa/drivers/dri/nouveau_winsys/nouveau_local.h
src/mesa/drivers/dri/nouveau_winsys/nouveau_pushbuf.c
src/mesa/pipe/nouveau/nouveau_bo.h
src/mesa/pipe/nouveau/nouveau_push.h
src/mesa/pipe/nv40/nv40_fragprog.c
src/mesa/pipe/nv40/nv40_fragtex.c
src/mesa/pipe/nv40/nv40_state.c
src/mesa/pipe/nv40/nv40_state_emit.c

index 7a539c81a943f7bf93549f162a89c815aa6848ae..59febca2929af70208311ae69b5f2f719a5e6d93 100644 (file)
@@ -61,9 +61,8 @@
 } while(0)
 
 #define OUT_RELOC(buf,data,flags,vor,tor) do {                                 \
-       nouveau_pipe_emit_reloc(nv->channel, nv->channel->pushbuf->cur,        \
+       nouveau_pipe_emit_reloc(nv->channel, nv->channel->pushbuf->cur++,      \
                                   buf, (data), (flags), (vor), (tor));        \
-       OUT_RING(0);                                                           \
 } while(0)
 
 /* Raw data + flags depending on FB/TT buffer */
index a34a5c18662b26d6e2499c455b6f64f9ddd461e0..7d5eddb92ff81bc41500125437f94f92294f09d6 100644 (file)
@@ -96,6 +96,31 @@ nouveau_pushbuf_init(struct nouveau_channel *chan)
        return 0;
 }
 
+static uint32_t
+nouveau_pushbuf_calc_reloc(struct nouveau_bo *bo,
+                          struct nouveau_pushbuf_reloc *r)
+{
+       uint32_t push;
+
+       if (r->flags & NOUVEAU_BO_LOW) {
+               push = bo->offset + r->data;
+       } else
+       if (r->flags & NOUVEAU_BO_HIGH) {
+               push = (bo->offset + r->data) >> 32;
+       } else {
+               push = r->data;
+       }
+
+       if (r->flags & NOUVEAU_BO_OR) {
+               if (bo->flags & NOUVEAU_BO_VRAM)
+                       push |= r->vor;
+               else
+                       push |= r->tor;
+       }
+
+       return push;
+}
+
 /* This would be our TTM "superioctl" */
 int
 nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
@@ -133,34 +158,20 @@ nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
 
                if (bo->offset == nouveau_bo(bo)->offset &&
                    bo->flags == nouveau_bo(bo)->flags) {
-                       /*XXX: could avoid reloc in this case, except with the
-                        *     current design we'd confuse the GPU quite a bit
-                        *     if we did this.  Will fix soon.
-                        */
+                       while ((r = ptr_to_pbrel(pbbo->relocs))) {
+                               pbbo->relocs = r->next;
+                               free(r);
+                       }
+
+                       nvpb->buffers = pbbo->next;
+                       free(pbbo);
+                       continue;
                }
                bo->offset = nouveau_bo(bo)->offset;
                bo->flags = nouveau_bo(bo)->flags;
 
                while ((r = ptr_to_pbrel(pbbo->relocs))) {
-                       uint32_t push;
-
-                       if (r->flags & NOUVEAU_BO_LOW) {
-                               push = bo->offset + r->data;
-                       } else
-                       if (r->flags & NOUVEAU_BO_HIGH) {
-                               push = (bo->offset + r->data) >> 32;
-                       } else {
-                               push = r->data;
-                       }
-
-                       if (r->flags & NOUVEAU_BO_OR) {
-                               if (bo->flags & NOUVEAU_BO_VRAM)
-                                       push |= r->vor;
-                               else
-                                       push |= r->tor;
-                       }
-
-                       *r->ptr = push;
+                       *r->ptr = nouveau_pushbuf_calc_reloc(bo, r);
                        pbbo->relocs = r->next;
                        free(r);
                }
@@ -241,6 +252,10 @@ nouveau_pushbuf_emit_reloc(struct nouveau_channel *chan, void *ptr,
        r->vor = vor;
        r->tor = tor;
 
+       if (flags & NOUVEAU_BO_DUMMY)
+               *(uint32_t *)ptr = 0;
+       else
+               *(uint32_t *)ptr = nouveau_pushbuf_calc_reloc(bo, r);
        return 0;
 }
 
index 2b57ee926338ca9d20c3fbc2fa71e7e0f46b87a4..18020e9c65212bb591d789b4ef52b9bd7c185c5b 100644 (file)
@@ -35,6 +35,7 @@
 #define NOUVEAU_BO_HIGH  (1 << 7)
 #define NOUVEAU_BO_OR    (1 << 8)
 #define NOUVEAU_BO_LOCAL (1 << 9)
+#define NOUVEAU_BO_DUMMY (1 << 31)
 
 struct nouveau_bo {
        struct nouveau_device *device;
index 117e3535cf70bf861fcf737f8396bc1e2c6cf49e..679472669b922a6aecac8b6a4fdcf1cb223308e3 100644 (file)
@@ -44,9 +44,8 @@
 #define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
        NOUVEAU_PUSH_CONTEXT(pc);                                              \
        pc->nvws->push_reloc(pc->nvws->channel,                                \
-                            pc->nvws->channel->pushbuf->cur,                  \
+                            pc->nvws->channel->pushbuf->cur++,                \
                             (bo), (data), (flags), (vor), (tor));             \
-       OUT_RING(0);                                                           \
 } while(0)
 
 /* Raw data + flags depending on FB/TT buffer */
        OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \
 } while(0)
 
+/* A reloc which'll recombine into a NV_DMA_METHOD packet header */
+#define OUT_RELOCm(bo, flags, obj, mthd, size) do {                            \
+       NOUVEAU_PUSH_CONTEXT(pc);                                              \
+       if (pc->nvws->channel->pushbuf->remaining < ((size) + 1))              \
+               pc->nvws->push_flush(pc->nvws->channel, ((size) + 1));         \
+       OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd),      \
+                  (flags), 0, 0);                                             \
+       pc->nvws->channel->pushbuf->remaining -= ((size) + 1);                 \
+} while(0)
+
 #endif
index 714634396d22267370f03165b9be4e6e2e554584..ef320393d943b882f98204c14c9478450503a48a 100644 (file)
@@ -815,6 +815,11 @@ nv40_fragprog_bind(struct nv40_context *nv40, struct nv40_fragment_program *fp)
                fp->on_hw = TRUE;
        }
 
+       BEGIN_RING(curie, NV40TCL_FP_ADDRESS, 1);
+       OUT_RELOC (fp->buffer, 0, NOUVEAU_BO_VRAM |
+                  NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+                  NOUVEAU_BO_OR, NV40TCL_FP_ADDRESS_DMA0,
+                  NV40TCL_FP_ADDRESS_DMA1);
        BEGIN_RING(curie, NV40TCL_FP_CONTROL, 1);
        OUT_RING  (fp->fp_control);
 
index 48d6eb629f5da5c902dfa7f8e7ecd5d3c77a0c33..7c5ecd5c566cd942b0a53a630e9e623cea19f5ff 100644 (file)
@@ -104,7 +104,13 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
        nv40->tex[unit].buffer = nv40mt->buffer;
        nv40->tex[unit].format = txf;
 
-       BEGIN_RING(curie, NV40TCL_TEX_WRAP(unit), 6);
+       BEGIN_RING(curie, NV40TCL_TEX_OFFSET(unit), 8);
+       OUT_RELOCl(nv40->tex[unit].buffer, 0, NOUVEAU_BO_VRAM |
+                  NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+       OUT_RELOCd(nv40->tex[unit].buffer, nv40->tex[unit].format,
+                  NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+                  NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0,
+                  NV40TCL_TEX_FORMAT_DMA1);
        OUT_RING  (ps->wrap);
        OUT_RING  (NV40TCL_TEX_ENABLE_ENABLE | ps->en |
                   (0x00078000) /* mipmap related? */);
index c619948b55aa07e9e2b54e3e7b6ca43421109160..bb435b106b455dd921167d8c1f5acb0f28f3c8e1 100644 (file)
@@ -603,33 +603,51 @@ nv40_set_framebuffer_state(struct pipe_context *pipe,
        }
 
        if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
-               BEGIN_RING(curie, NV40TCL_COLOR0_PITCH, 1);
-               OUT_RING  (rt[0]->pitch * rt[0]->cpp);
                nv40->rt[0] = rt[0]->buffer;
+               BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 1);
+               OUT_RELOCo(nv40->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               BEGIN_RING(curie, NV40TCL_COLOR0_PITCH, 2);
+               OUT_RING  (rt[0]->pitch * rt[0]->cpp);
+               OUT_RELOCl(nv40->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
        }
 
        if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
-               BEGIN_RING(curie, NV40TCL_COLOR1_PITCH, 2);
-               OUT_RING  (rt[1]->pitch * rt[1]->cpp);
                nv40->rt[1] = rt[1]->buffer;
+               BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1);
+               OUT_RELOCo(nv40->rt[1], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               BEGIN_RING(curie, NV40TCL_COLOR1_OFFSET, 2);
+               OUT_RELOCl(nv40->rt[1], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               OUT_RING  (rt[1]->pitch * rt[1]->cpp);
        }
 
        if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
+               nv40->rt[2] = rt[2]->buffer;
+               BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 1);
+               OUT_RELOCo(nv40->rt[2], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               BEGIN_RING(curie, NV40TCL_COLOR2_OFFSET, 1);
+               OUT_RELOCl(nv40->rt[2], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
                BEGIN_RING(curie, NV40TCL_COLOR2_PITCH, 1);
                OUT_RING  (rt[2]->pitch * rt[2]->cpp);
-               nv40->rt[2] = rt[2]->buffer;
        }
 
        if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
+               nv40->rt[3] = rt[3]->buffer;
+               BEGIN_RING(curie, NV40TCL_DMA_COLOR3, 1);
+               OUT_RELOCo(nv40->rt[3], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               BEGIN_RING(curie, NV40TCL_COLOR3_OFFSET, 1);
+               OUT_RELOCl(nv40->rt[3], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
                BEGIN_RING(curie, NV40TCL_COLOR3_PITCH, 1);
                OUT_RING  (rt[3]->pitch * rt[3]->cpp);
-               nv40->rt[3] = rt[3]->buffer;
        }
 
        if (zeta_format) {
+               nv40->zeta = zeta->buffer;
+               BEGIN_RING(curie, NV40TCL_DMA_ZETA, 1);
+               OUT_RELOCo(nv40->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               BEGIN_RING(curie, NV40TCL_ZETA_OFFSET, 1);
+               OUT_RELOCl(nv40->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
                BEGIN_RING(curie, NV40TCL_ZETA_PITCH, 1);
                OUT_RING  (zeta->pitch * zeta->cpp);
-               nv40->zeta = zeta->buffer;
        }
 
        nv40->rt_enable = rt_enable;
index c9a7a2e364b4cf738618df8d878d58c9bbd4ce96..66b98d5fab3a2e7f4f18fb7cbebe71e42e6d6f48 100644 (file)
 #include "nv40_context.h"
 #include "nv40_state.h"
 
-void
-nv40_emit_hw_state(struct nv40_context *nv40)
+/* Emit relocs for every referenced buffer.
+ *
+ * This is to ensure the bufmgr has an accurate idea of how
+ * the buffer is used.  These relocs appear in the push buffer as
+ * NOPs, and will only be turned into state changes if a buffer
+ * actually moves.
+ */
+static void
+nv40_state_emit_dummy_relocs(struct nv40_context *nv40)
 {
-       int i;
-
-       if (nv40->dirty & NV40_NEW_FRAGPROG) {
-               nv40_fragprog_bind(nv40, nv40->fragprog.current);
-               /*XXX: clear NV40_NEW_FRAGPROG if no new program uploaded */
-       }
-
-       if (nv40->dirty_samplers || (nv40->dirty & NV40_NEW_FRAGPROG)) {
-               nv40_fragtex_bind(nv40);
-
-               BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-               OUT_RING  (2);
-               BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-               OUT_RING  (1);
-               nv40->dirty &= ~NV40_NEW_FRAGPROG;
-       }
-
-       if (nv40->dirty & NV40_NEW_VERTPROG) {
-               nv40_vertprog_bind(nv40, nv40->vertprog.current);
-               nv40->dirty &= ~NV40_NEW_VERTPROG;
-       }
-
-       nv40->dirty_samplers = 0;
-
-       /* Emit relocs for every referenced buffer.
-        * This is to ensure the bufmgr has an accurate idea of how
-        * the buffer is used.  This isn't very efficient, but we don't
-        * seem to take a significant performance hit.  Will be improved
-        * at some point.  Vertex arrays are emitted by nv40_vbo.c
-        */
+       unsigned rt_flags, tx_flags, fp_flags;
+       int i;  
+       
+       rt_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR | NOUVEAU_BO_DUMMY;
+       tx_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+                  NOUVEAU_BO_DUMMY;
+       fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+                  NOUVEAU_BO_DUMMY;
 
        /* Render targets */
        if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
-               BEGIN_RING(curie, NV40TCL_DMA_COLOR0, 1);
-               OUT_RELOCo(nv40->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-               BEGIN_RING(curie, NV40TCL_COLOR0_OFFSET, 1);
-               OUT_RELOCl(nv40->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               OUT_RELOCm(nv40->rt[0], rt_flags,
+                          curie, NV40TCL_DMA_COLOR0, 1);
+               OUT_RELOCo(nv40->rt[0], rt_flags);
+               OUT_RELOCm(nv40->rt[0], rt_flags,
+                          curie, NV40TCL_COLOR0_OFFSET, 1);
+               OUT_RELOCl(nv40->rt[0], 0, rt_flags);
        }
 
        if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
-               BEGIN_RING(curie, NV40TCL_DMA_COLOR1, 1);
-               OUT_RELOCo(nv40->rt[1], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-               BEGIN_RING(curie, NV40TCL_COLOR1_OFFSET, 1);
-               OUT_RELOCl(nv40->rt[1], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               OUT_RELOCm(nv40->rt[1], rt_flags,
+                          curie, NV40TCL_DMA_COLOR1, 1);
+               OUT_RELOCo(nv40->rt[1], rt_flags);
+               OUT_RELOCm(nv40->rt[1], rt_flags,
+                          curie, NV40TCL_COLOR1_OFFSET, 1);
+               OUT_RELOCl(nv40->rt[1], 0, rt_flags);
        }
 
        if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
-               BEGIN_RING(curie, NV40TCL_DMA_COLOR2, 1);
-               OUT_RELOCo(nv40->rt[2], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-               BEGIN_RING(curie, NV40TCL_COLOR2_OFFSET, 1);
-               OUT_RELOCl(nv40->rt[2], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               OUT_RELOCm(nv40->rt[2], rt_flags,
+                          curie, NV40TCL_DMA_COLOR2, 1);
+               OUT_RELOCo(nv40->rt[2], rt_flags);
+               OUT_RELOCm(nv40->rt[2], rt_flags,
+                          curie, NV40TCL_COLOR2_OFFSET, 1);
+               OUT_RELOCl(nv40->rt[2], 0, rt_flags);
        }
 
        if (nv40->rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
-               BEGIN_RING(curie, NV40TCL_DMA_COLOR3, 1);
-               OUT_RELOCo(nv40->rt[3], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-               BEGIN_RING(curie, NV40TCL_COLOR3_OFFSET, 1);
-               OUT_RELOCl(nv40->rt[3], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               OUT_RELOCm(nv40->rt[3], rt_flags,
+                          curie, NV40TCL_DMA_COLOR3, 1);
+               OUT_RELOCo(nv40->rt[3], rt_flags);
+               OUT_RELOCm(nv40->rt[3], rt_flags,
+                          curie, NV40TCL_COLOR3_OFFSET, 1);
+               OUT_RELOCl(nv40->rt[3], 0, rt_flags);
        }
 
        if (nv40->zeta) {
-               BEGIN_RING(curie, NV40TCL_DMA_ZETA, 1);
-               OUT_RELOCo(nv40->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-               BEGIN_RING(curie, NV40TCL_ZETA_OFFSET, 1);
-               OUT_RELOCl(nv40->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+               OUT_RELOCm(nv40->zeta, rt_flags, curie, NV40TCL_DMA_ZETA, 1);
+               OUT_RELOCo(nv40->zeta, rt_flags);
+               OUT_RELOCm(nv40->zeta, rt_flags, curie, NV40TCL_ZETA_OFFSET, 1);
+               OUT_RELOCl(nv40->zeta, 0, rt_flags);
        }
 
        /* Texture images */
        for (i = 0; i < 16; i++) {
                if (!(nv40->fp_samplers & (1 << i)))
                        continue;
-               BEGIN_RING(curie, NV40TCL_TEX_OFFSET(i), 2);
-               OUT_RELOCl(nv40->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
-                          NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+               OUT_RELOCm(nv40->tex[i].buffer, tx_flags,
+                          curie, NV40TCL_TEX_OFFSET(i), 2);
+               OUT_RELOCl(nv40->tex[i].buffer, 0, tx_flags);
                OUT_RELOCd(nv40->tex[i].buffer, nv40->tex[i].format,
-                          NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
-                          NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0,
+                          tx_flags | NOUVEAU_BO_OR, NV40TCL_TEX_FORMAT_DMA0,
                           NV40TCL_TEX_FORMAT_DMA1);
        }
 
        /* Fragment program */
-       BEGIN_RING(curie, NV40TCL_FP_ADDRESS, 1);
-       OUT_RELOC (nv40->fragprog.active->buffer, 0, NOUVEAU_BO_VRAM |
-                  NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
-                  NOUVEAU_BO_OR, NV40TCL_FP_ADDRESS_DMA0,
-                  NV40TCL_FP_ADDRESS_DMA1);
+       OUT_RELOCm(nv40->fragprog.active->buffer, fp_flags,
+                  curie, NV40TCL_FP_ADDRESS, 1);
+       OUT_RELOC (nv40->fragprog.active->buffer, 0,
+                  fp_flags | NOUVEAU_BO_OR | NOUVEAU_BO_LOW,
+                  NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
+}
+
+void
+nv40_emit_hw_state(struct nv40_context *nv40)
+{
+       if (nv40->dirty & NV40_NEW_FRAGPROG) {
+               nv40_fragprog_bind(nv40, nv40->fragprog.current);
+               /*XXX: clear NV40_NEW_FRAGPROG if no new program uploaded */
+       }
+
+       if (nv40->dirty_samplers || (nv40->dirty & NV40_NEW_FRAGPROG)) {
+               nv40_fragtex_bind(nv40);
+
+               BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+               OUT_RING  (2);
+               BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+               OUT_RING  (1);
+               nv40->dirty &= ~NV40_NEW_FRAGPROG;
+       }
+
+       if (nv40->dirty & NV40_NEW_VERTPROG) {
+               nv40_vertprog_bind(nv40, nv40->vertprog.current);
+               nv40->dirty &= ~NV40_NEW_VERTPROG;
+       }
+
+       nv40->dirty_samplers = 0;
+
+       nv40_state_emit_dummy_relocs(nv40);
 }