llvmpipe: Remove unnecessary headers.
[mesa.git] / src / gallium / drivers / nv50 / nv50_vbo.c
index f7fa0659e8c68490bc2ed74e7ceac46fccd095ee..ca2f8061f321da5273560afe207422877c54168e 100644 (file)
@@ -22,7 +22,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
-#include "pipe/p_inlines.h"
+#include "util/u_inlines.h"
 
 #include "util/u_format.h"
 
@@ -40,6 +40,8 @@ nv50_push_elements_u32(struct nv50_context *, uint32_t *, unsigned);
 static boolean
 nv50_push_arrays(struct nv50_context *, unsigned, unsigned);
 
+#define NV50_USING_LOATHED_EDGEFLAG(ctx) ((ctx)->vertprog->cfg.edgeflag_in < 16)
+
 static INLINE unsigned
 nv50_prim(unsigned mode)
 {
@@ -55,6 +57,14 @@ nv50_prim(unsigned mode)
        case PIPE_PRIM_QUADS: return NV50TCL_VERTEX_BEGIN_QUADS;
        case PIPE_PRIM_QUAD_STRIP: return NV50TCL_VERTEX_BEGIN_QUAD_STRIP;
        case PIPE_PRIM_POLYGON: return NV50TCL_VERTEX_BEGIN_POLYGON;
+       case PIPE_PRIM_LINES_ADJACENCY:
+               return NV50TCL_VERTEX_BEGIN_LINES_ADJACENCY;
+       case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+               return NV50TCL_VERTEX_BEGIN_LINE_STRIP_ADJACENCY;
+       case PIPE_PRIM_TRIANGLES_ADJACENCY:
+               return NV50TCL_VERTEX_BEGIN_TRIANGLES_ADJACENCY;
+       case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+               return NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP_ADJACENCY;
        default:
                break;
        }
@@ -99,19 +109,19 @@ nv50_vbo_size_to_hw(unsigned size, unsigned nr_c)
 {
        static const uint32_t hw_values[] = {
                0, 0, 0, 0,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8_8,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8_8_8,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16_16,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16_16_16,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16,
                0, 0, 0, 0,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32_32,
-               NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32_32_32 };
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32,
+               NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 };
 
        /* we'd also have R11G11B10 and R10G10B10A2 */
 
@@ -152,7 +162,310 @@ nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
        return (hw_type | hw_size);
 }
 
-boolean
+/* For instanced drawing from user buffers, hitting the FIFO repeatedly
+ * with the same vertex data is probably worse than uploading all data.
+ */
+static boolean
+nv50_upload_vtxbuf(struct nv50_context *nv50, unsigned i)
+{
+       struct nv50_screen *nscreen = nv50->screen;
+       struct pipe_screen *pscreen = &nscreen->base.base;
+       struct pipe_buffer *buf = nscreen->strm_vbuf[i];
+       struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
+       uint8_t *src;
+       unsigned size = align(vb->buffer->size, 4096);
+
+       if (buf && buf->size < size)
+               pipe_buffer_reference(&nscreen->strm_vbuf[i], NULL);
+
+       if (!nscreen->strm_vbuf[i]) {
+               nscreen->strm_vbuf[i] = pipe_buffer_create(
+                       pscreen, 0, PIPE_BUFFER_USAGE_VERTEX, size);
+               buf = nscreen->strm_vbuf[i];
+       }
+
+       src = pipe_buffer_map(pscreen, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+       if (!src)
+               return FALSE;
+       src += vb->buffer_offset;
+
+       size = (vb->max_index + 1) * vb->stride + 16; /* + 16 is for stride 0 */
+       if (vb->buffer_offset + size > vb->buffer->size)
+               size = vb->buffer->size - vb->buffer_offset;
+
+       pipe_buffer_write(pscreen, buf, vb->buffer_offset, size, src);
+       pipe_buffer_unmap(pscreen, vb->buffer);
+
+       vb->buffer = buf; /* don't pipe_reference, this is a private copy */
+       return TRUE;
+}
+
+static void
+nv50_upload_user_vbufs(struct nv50_context *nv50)
+{
+       unsigned i;
+
+       if (nv50->vbo_fifo)
+               nv50->dirty |= NV50_NEW_ARRAYS;
+       if (!(nv50->dirty & NV50_NEW_ARRAYS))
+               return;
+
+       for (i = 0; i < nv50->vtxbuf_nr; ++i) {
+               if (nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX)
+                       continue;
+               nv50_upload_vtxbuf(nv50, i);
+       }
+}
+
+static void
+nv50_set_static_vtxattr(struct nv50_context *nv50, unsigned i, void *data)
+{
+       struct nouveau_grobj *tesla = nv50->screen->tesla;
+       struct nouveau_channel *chan = tesla->channel;
+       float v[4];
+
+       util_format_read_4f(nv50->vtxelt[i].src_format,
+                           v, 0, data, 0, 0, 0, 1, 1);
+
+       switch (nv50->vtxelt[i].nr_components) {
+       case 4:
+               BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_4F_X(i), 4);
+               OUT_RINGf (chan, v[0]);
+               OUT_RINGf (chan, v[1]);
+               OUT_RINGf (chan, v[2]);
+               OUT_RINGf (chan, v[3]);
+               break;
+       case 3:
+               BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_3F_X(i), 3);
+               OUT_RINGf (chan, v[0]);
+               OUT_RINGf (chan, v[1]);
+               OUT_RINGf (chan, v[2]);
+               break;
+       case 2:
+               BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_2F_X(i), 2);
+               OUT_RINGf (chan, v[0]);
+               OUT_RINGf (chan, v[1]);
+               break;
+       case 1:
+               BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_1F(i), 1);
+               OUT_RINGf (chan, v[0]);
+               break;
+       default:
+               assert(0);
+               break;
+       }
+}
+
+static unsigned
+init_per_instance_arrays_immd(struct nv50_context *nv50,
+                             unsigned startInstance,
+                             unsigned pos[16], unsigned step[16])
+{
+       struct nouveau_bo *bo;
+       unsigned i, b, count = 0;
+
+       for (i = 0; i < nv50->vtxelt_nr; ++i) {
+               if (!nv50->vtxelt[i].instance_divisor)
+                       continue;
+               ++count;
+               b = nv50->vtxelt[i].vertex_buffer_index;
+
+               pos[i] = nv50->vtxelt[i].src_offset +
+                       nv50->vtxbuf[b].buffer_offset +
+                       startInstance * nv50->vtxbuf[b].stride;
+               step[i] = startInstance % nv50->vtxelt[i].instance_divisor;
+
+               bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+               if (!bo->map)
+                       nouveau_bo_map(bo, NOUVEAU_BO_RD);
+
+               nv50_set_static_vtxattr(nv50, i, (uint8_t *)bo->map + pos[i]);
+       }
+
+       return count;
+}
+
+static unsigned
+init_per_instance_arrays(struct nv50_context *nv50,
+                        unsigned startInstance,
+                        unsigned pos[16], unsigned step[16])
+{
+       struct nouveau_grobj *tesla = nv50->screen->tesla;
+       struct nouveau_channel *chan = tesla->channel;
+       struct nouveau_bo *bo;
+       struct nouveau_stateobj *so;
+       unsigned i, b, count = 0;
+       const uint32_t rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+       if (nv50->vbo_fifo)
+               return init_per_instance_arrays_immd(nv50, startInstance,
+                                                    pos, step);
+
+       so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 2, nv50->vtxelt_nr * 2);
+
+       for (i = 0; i < nv50->vtxelt_nr; ++i) {
+               if (!nv50->vtxelt[i].instance_divisor)
+                       continue;
+               ++count;
+               b = nv50->vtxelt[i].vertex_buffer_index;
+
+               pos[i] = nv50->vtxelt[i].src_offset +
+                       nv50->vtxbuf[b].buffer_offset +
+                       startInstance * nv50->vtxbuf[b].stride;
+
+               if (!startInstance) {
+                       step[i] = 0;
+                       continue;
+               }
+               step[i] = startInstance % nv50->vtxelt[i].instance_divisor;
+
+               bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+               so_method(so, tesla, NV50TCL_VERTEX_ARRAY_START_HIGH(i), 2);
+               so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_HIGH, 0, 0);
+               so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_LOW, 0, 0);
+       }
+
+       if (count && startInstance) {
+               so_ref (so, &nv50->state.instbuf); /* for flush notify */
+               so_emit(chan, nv50->state.instbuf);
+       }
+       so_ref (NULL, &so);
+
+       return count;
+}
+
+static void
+step_per_instance_arrays_immd(struct nv50_context *nv50,
+                             unsigned pos[16], unsigned step[16])
+{
+       struct nouveau_bo *bo;
+       unsigned i, b;
+
+       for (i = 0; i < nv50->vtxelt_nr; ++i) {
+               if (!nv50->vtxelt[i].instance_divisor)
+                       continue;
+               if (++step[i] != nv50->vtxelt[i].instance_divisor)
+                       continue;
+               b = nv50->vtxelt[i].vertex_buffer_index;
+               bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+               step[i] = 0;
+               pos[i] += nv50->vtxbuf[b].stride;
+
+               nv50_set_static_vtxattr(nv50, i, (uint8_t *)bo->map + pos[i]);
+       }
+}
+
+static void
+step_per_instance_arrays(struct nv50_context *nv50,
+                        unsigned pos[16], unsigned step[16])
+{
+       struct nouveau_grobj *tesla = nv50->screen->tesla;
+       struct nouveau_channel *chan = tesla->channel;
+       struct nouveau_bo *bo;
+       struct nouveau_stateobj *so;
+       unsigned i, b;
+       const uint32_t rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+       if (nv50->vbo_fifo) {
+               step_per_instance_arrays_immd(nv50, pos, step);
+               return;
+       }
+
+       so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 2, nv50->vtxelt_nr * 2);
+
+       for (i = 0; i < nv50->vtxelt_nr; ++i) {
+               if (!nv50->vtxelt[i].instance_divisor)
+                       continue;
+               b = nv50->vtxelt[i].vertex_buffer_index;
+
+               if (++step[i] == nv50->vtxelt[i].instance_divisor) {
+                       step[i] = 0;
+                       pos[i] += nv50->vtxbuf[b].stride;
+               }
+
+               bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+               so_method(so, tesla, NV50TCL_VERTEX_ARRAY_START_HIGH(i), 2);
+               so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_HIGH, 0, 0);
+               so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_LOW, 0, 0);
+       }
+
+       so_ref (so, &nv50->state.instbuf); /* for flush notify */
+       so_ref (NULL, &so);
+
+       so_emit(chan, nv50->state.instbuf);
+}
+
+static INLINE void
+nv50_unmap_vbufs(struct nv50_context *nv50)
+{
+        unsigned i;
+
+        for (i = 0; i < nv50->vtxbuf_nr; ++i)
+                if (nouveau_bo(nv50->vtxbuf[i].buffer)->map)
+                        nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer));
+}
+
+void
+nv50_draw_arrays_instanced(struct pipe_context *pipe,
+                          unsigned mode, unsigned start, unsigned count,
+                          unsigned startInstance, unsigned instanceCount)
+{
+       struct nv50_context *nv50 = nv50_context(pipe);
+       struct nouveau_channel *chan = nv50->screen->tesla->channel;
+       struct nouveau_grobj *tesla = nv50->screen->tesla;
+       unsigned i, nz_divisors;
+       unsigned step[16], pos[16];
+
+       if (!NV50_USING_LOATHED_EDGEFLAG(nv50))
+               nv50_upload_user_vbufs(nv50);
+
+       nv50_state_validate(nv50);
+
+       nz_divisors = init_per_instance_arrays(nv50, startInstance, pos, step);
+
+       BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+       OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
+       OUT_RING  (chan, startInstance);
+
+       BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+       OUT_RING  (chan, nv50_prim(mode));
+
+       if (nv50->vbo_fifo)
+               nv50_push_arrays(nv50, start, count);
+       else {
+               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+               OUT_RING  (chan, start);
+               OUT_RING  (chan, count);
+       }
+       BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+       OUT_RING  (chan, 0);
+
+       for (i = 1; i < instanceCount; i++) {
+               if (nz_divisors) /* any non-zero array divisors ? */
+                       step_per_instance_arrays(nv50, pos, step);
+
+               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+               OUT_RING  (chan, nv50_prim(mode) | (1 << 28));
+
+               if (nv50->vbo_fifo)
+                       nv50_push_arrays(nv50, start, count);
+               else {
+                       BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+                       OUT_RING  (chan, start);
+                       OUT_RING  (chan, count);
+               }
+               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+               OUT_RING  (chan, 0);
+       }
+       nv50_unmap_vbufs(nv50);
+
+       so_ref(NULL, &nv50->state.instbuf);
+}
+
+void
 nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
                 unsigned count)
 {
@@ -182,7 +495,11 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
        BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
        OUT_RING  (chan, 0);
 
-       return ret;
+       nv50_unmap_vbufs(nv50);
+
+        /* XXX: not sure what to do if ret != TRUE: flush and retry?
+         */
+        assert(ret);
 }
 
 static INLINE boolean
@@ -198,7 +515,7 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
                return nv50_push_elements_u08(nv50, map, count);
 
        if (count & 1) {
-               BEGIN_RING(chan, tesla, 0x15e8, 1);
+               BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32, 1);
                OUT_RING  (chan, map[0]);
                map++;
                count--;
@@ -208,7 +525,7 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
                unsigned nr = count > 2046 ? 2046 : count;
                int i;
 
-               BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1);
+               BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U16, nr >> 1);
                for (i = 0; i < nr; i += 2)
                        OUT_RING  (chan, (map[i + 1] << 16) | map[i]);
 
@@ -231,7 +548,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
                return nv50_push_elements_u16(nv50, map, count);
 
        if (count & 1) {
-               BEGIN_RING(chan, tesla, 0x15e8, 1);
+               BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32, 1);
                OUT_RING  (chan, map[0]);
                map++;
                count--;
@@ -241,7 +558,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
                unsigned nr = count > 2046 ? 2046 : count;
                int i;
 
-               BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1);
+               BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U16, nr >> 1);
                for (i = 0; i < nr; i += 2)
                        OUT_RING  (chan, (map[i + 1] << 16) | map[i]);
 
@@ -266,7 +583,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
        while (count) {
                unsigned nr = count > 2047 ? 2047 : count;
 
-               BEGIN_RING(chan, tesla, 0x400015e8, nr);
+               BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U32, nr);
                OUT_RINGp (chan, map, nr);
 
                count -= nr;
@@ -275,7 +592,78 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
        return TRUE;
 }
 
-boolean
+static INLINE void
+nv50_draw_elements_inline(struct nv50_context *nv50,
+                         void *map, unsigned indexSize,
+                         unsigned start, unsigned count)
+{
+       switch (indexSize) {
+       case 1:
+               nv50_draw_elements_inline_u08(nv50, map, start, count);
+               break;
+       case 2:
+               nv50_draw_elements_inline_u16(nv50, map, start, count);
+               break;
+       case 4:
+               nv50_draw_elements_inline_u32(nv50, map, start, count);
+               break;
+       }
+}
+
+void
+nv50_draw_elements_instanced(struct pipe_context *pipe,
+                            struct pipe_buffer *indexBuffer,
+                            unsigned indexSize,
+                            unsigned mode, unsigned start, unsigned count,
+                            unsigned startInstance, unsigned instanceCount)
+{
+       struct nv50_context *nv50 = nv50_context(pipe);
+       struct nouveau_grobj *tesla = nv50->screen->tesla;
+       struct nouveau_channel *chan = tesla->channel;
+       struct pipe_screen *pscreen = pipe->screen;
+       void *map;
+       unsigned i, nz_divisors;
+       unsigned step[16], pos[16];
+
+       map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
+
+       if (!NV50_USING_LOATHED_EDGEFLAG(nv50))
+               nv50_upload_user_vbufs(nv50);
+
+       nv50_state_validate(nv50);
+
+       nz_divisors = init_per_instance_arrays(nv50, startInstance, pos, step);
+
+       BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+       OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
+       OUT_RING  (chan, startInstance);
+
+       BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+       OUT_RING  (chan, nv50_prim(mode));
+
+       nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
+       BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+       OUT_RING  (chan, 0);
+
+       for (i = 1; i < instanceCount; ++i) {
+               if (nz_divisors) /* any non-zero array divisors ? */
+                       step_per_instance_arrays(nv50, pos, step);
+
+               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+               OUT_RING  (chan, nv50_prim(mode) | (1 << 28));
+
+               nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
+               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+               OUT_RING  (chan, 0);
+       }
+       nv50_unmap_vbufs(nv50);
+
+       so_ref(NULL, &nv50->state.instbuf);
+}
+
+void
 nv50_draw_elements(struct pipe_context *pipe,
                   struct pipe_buffer *indexBuffer, unsigned indexSize,
                   unsigned mode, unsigned start, unsigned count)
@@ -285,7 +673,6 @@ nv50_draw_elements(struct pipe_context *pipe,
        struct nouveau_grobj *tesla = nv50->screen->tesla;
        struct pipe_screen *pscreen = pipe->screen;
        void *map;
-       boolean ret;
        
        map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
 
@@ -298,27 +685,15 @@ nv50_draw_elements(struct pipe_context *pipe,
 
        BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
        OUT_RING  (chan, nv50_prim(mode));
-       switch (indexSize) {
-       case 1:
-               ret = nv50_draw_elements_inline_u08(nv50, map, start, count);
-               break;
-       case 2:
-               ret = nv50_draw_elements_inline_u16(nv50, map, start, count);
-               break;
-       case 4:
-               ret = nv50_draw_elements_inline_u32(nv50, map, start, count);
-               break;
-       default:
-               assert(0);
-               ret = FALSE;
-               break;
-       }
+
+       nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
        BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
        OUT_RING  (chan, 0);
 
-       pipe_buffer_unmap(pscreen, indexBuffer);
+       nv50_unmap_vbufs(nv50);
 
-       return ret;
+       pipe_buffer_unmap(pscreen, indexBuffer);
 }
 
 static INLINE boolean
@@ -331,26 +706,19 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
        struct nouveau_stateobj *so;
        struct nouveau_grobj *tesla = nv50->screen->tesla;
        struct nouveau_bo *bo = nouveau_bo(vb->buffer);
-       float *v;
+       float v[4];
        int ret;
-       enum pipe_format pf = ve->src_format;
-       const struct util_format_description *desc;
-
-       desc = util_format_description(pf);
-       assert(desc);
-
-       if ((desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT) ||
-           util_format_get_component_bits(pf, UTIL_FORMAT_COLORSPACE_RGB, 0) != 32)
-               return FALSE;
 
        ret = nouveau_bo_map(bo, NOUVEAU_BO_RD);
        if (ret)
                return FALSE;
-       v = (float *)(bo->map + (vb->buffer_offset + ve->src_offset));
 
+       util_format_read_4f(ve->src_format, v, 0, (uint8_t *)bo->map +
+                           (vb->buffer_offset + ve->src_offset), 0,
+                           0, 0, 1, 1);
        so = *pso;
        if (!so)
-               *pso = so = so_new(nv50->vtxelt_nr * 5, 0);
+               *pso = so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 4, 0);
 
        switch (ve->nr_components) {
        case 4:
@@ -372,6 +740,10 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
                so_data  (so, fui(v[1]));
                break;
        case 1:
+               if (attrib == nv50->vertprog->cfg.edgeflag_in) {
+                       so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+                       so_data  (so, v[0] ? 1 : 0);
+               }
                so_method(so, tesla, NV50TCL_VTX_ATTR_1F(attrib), 1);
                so_data  (so, fui(v[0]));
                break;
@@ -401,11 +773,14 @@ nv50_vbo_validate(struct nv50_context *nv50)
                    !(nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX))
                        nv50->vbo_fifo = 0xffff;
 
+       if (NV50_USING_LOATHED_EDGEFLAG(nv50))
+               nv50->vbo_fifo = 0xffff; /* vertprog can't set edgeflag */
+
        n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr);
 
        vtxattr = NULL;
-       vtxbuf = so_new(n_ve * 7, nv50->vtxelt_nr * 4);
-       vtxfmt = so_new(n_ve + 1, 0);
+       vtxbuf = so_new(n_ve * 2, n_ve * 5, nv50->vtxelt_nr * 4);
+       vtxfmt = so_new(1, n_ve, 0);
        so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), n_ve);
 
        for (i = 0; i < nv50->vtxelt_nr; i++) {
@@ -426,17 +801,20 @@ nv50_vbo_validate(struct nv50_context *nv50)
                        nv50->vbo_fifo &= ~(1 << i);
                        continue;
                }
-               so_data(vtxfmt, hw | i);
 
                if (nv50->vbo_fifo) {
+                       so_data  (vtxfmt, hw |
+                                 (ve->instance_divisor ? (1 << 4) : i));
                        so_method(vtxbuf, tesla,
                                  NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
                        so_data  (vtxbuf, 0);
                        continue;
                }
+               so_data(vtxfmt, hw | i);
 
                so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 3);
-               so_data  (vtxbuf, 0x20000000 | vb->stride);
+               so_data  (vtxbuf, 0x20000000 |
+                         (ve->instance_divisor ? 0 : vb->stride));
                so_reloc (vtxbuf, bo, vb->buffer_offset +
                          ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
                          NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
@@ -445,7 +823,7 @@ nv50_vbo_validate(struct nv50_context *nv50)
                          NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 
                /* vertex array limits */
-               so_method(vtxbuf, tesla, 0x1080 + (i * 8), 2);
+               so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_LIMIT_HIGH(i), 2);
                so_reloc (vtxbuf, bo, vb->buffer->size - 1,
                          NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
                          NOUVEAU_BO_HIGH, 0, 0);
@@ -474,11 +852,14 @@ typedef void (*pfn_push)(struct nouveau_channel *, void *);
 struct nv50_vbo_emitctx
 {
        pfn_push push[16];
-       void *map[16];
+       uint8_t *map[16];
        unsigned stride[16];
        unsigned nr_ve;
        unsigned vtx_dwords;
        unsigned vtx_max;
+
+       float edgeflag;
+       unsigned ve_edgeflag;
 };
 
 static INLINE void
@@ -509,19 +890,18 @@ nv50_map_vbufs(struct nv50_context *nv50)
 
        for (i = 0; i < nv50->vtxbuf_nr; ++i) {
                struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
-               unsigned size, delta;
+               unsigned size = vb->stride * (vb->max_index + 1) + 16;
 
                if (nouveau_bo(vb->buffer)->map)
                        continue;
 
-               size = vb->stride * (vb->max_index + 1);
-               delta = vb->buffer_offset;
-
+               size = vb->stride * (vb->max_index + 1) + 16;
+               size = MIN2(size, vb->buffer->size);
                if (!size)
-                       size = vb->buffer->size - vb->buffer_offset;
+                       size = vb->buffer->size;
 
                if (nouveau_bo_map_range(nouveau_bo(vb->buffer),
-                                        delta, size, NOUVEAU_BO_RD))
+                                        0, size, NOUVEAU_BO_RD))
                        break;
        }
 
@@ -532,16 +912,6 @@ nv50_map_vbufs(struct nv50_context *nv50)
        return FALSE;
 }
 
-static INLINE void
-nv50_unmap_vbufs(struct nv50_context *nv50)
-{
-        unsigned i;
-
-        for (i = 0; i < nv50->vtxbuf_nr; ++i)
-                if (nouveau_bo(nv50->vtxbuf[i].buffer)->map)
-                        nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer));
-}
-
 static void
 emit_b32_1(struct nouveau_channel *chan, void *data)
 {
@@ -622,6 +992,9 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
        if (nv50_map_vbufs(nv50) == FALSE)
                return FALSE;
 
+       emit->ve_edgeflag = nv50->vertprog->cfg.edgeflag_in;
+
+       emit->edgeflag = 0.5f;
        emit->nr_ve = 0;
        emit->vtx_dwords = 0;
 
@@ -633,18 +1006,20 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
 
                ve = &nv50->vtxelt[i];
                vb = &nv50->vtxbuf[ve->vertex_buffer_index];
-               if (!(nv50->vbo_fifo & (1 << i)))
+               if (!(nv50->vbo_fifo & (1 << i)) || ve->instance_divisor)
                        continue;
                n = emit->nr_ve++;
 
                emit->stride[n] = vb->stride;
-               emit->map[n] = nouveau_bo(vb->buffer)->map +
+               emit->map[n] = (uint8_t *)nouveau_bo(vb->buffer)->map +
+                       vb->buffer_offset +
                        (start * vb->stride + ve->src_offset);
 
                desc = util_format_description(ve->src_format);
                assert(desc);
 
-               size = util_format_get_component_bits(ve->src_format, UTIL_FORMAT_COLORSPACE_RGB, 0);
+               size = util_format_get_component_bits(
+                       ve->src_format, UTIL_FORMAT_COLORSPACE_RGB, 0);
 
                assert(ve->nr_components > 0 && ve->nr_components <= 4);
 
@@ -686,10 +1061,31 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
        }
 
        emit->vtx_max = 512 / emit->vtx_dwords;
+       if (emit->ve_edgeflag < 16)
+               emit->vtx_max = 1;
 
        return TRUE;
 }
 
+static INLINE void
+set_edgeflag(struct nouveau_channel *chan,
+            struct nouveau_grobj *tesla,
+            struct nv50_vbo_emitctx *emit, uint32_t index)
+{
+       unsigned i = emit->ve_edgeflag;
+
+       if (i < 16) {
+               float f = *((float *)(emit->map[i] + index * emit->stride[i]));
+
+               if (emit->edgeflag != f) {
+                       emit->edgeflag = f;
+
+                       BEGIN_RING(chan, tesla, 0x15e4, 1);
+                       OUT_RING  (chan, f ? 1 : 0);
+               }
+       }
+}
+
 static boolean
 nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count)
 {
@@ -704,13 +1100,14 @@ nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count)
                unsigned i, dw, nr = MIN2(count, emit.vtx_max);
                dw = nr * emit.vtx_dwords;
 
-               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+               set_edgeflag(chan, tesla, &emit, 0); /* nr will be 1 */
+
+               BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
                for (i = 0; i < nr; ++i)
                        emit_vtx_next(chan, &emit);
 
                count -= nr;
        }
-       nv50_unmap_vbufs(nv50);
 
        return TRUE;
 }
@@ -729,13 +1126,14 @@ nv50_push_elements_u32(struct nv50_context *nv50, uint32_t *map, unsigned count)
                unsigned i, dw, nr = MIN2(count, emit.vtx_max);
                dw = nr * emit.vtx_dwords;
 
-               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+               set_edgeflag(chan, tesla, &emit, *map);
+
+               BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
                for (i = 0; i < nr; ++i)
                        emit_vtx(chan, &emit, *map++);
 
                count -= nr;
        }
-       nv50_unmap_vbufs(nv50);
 
        return TRUE;
 }
@@ -754,13 +1152,14 @@ nv50_push_elements_u16(struct nv50_context *nv50, uint16_t *map, unsigned count)
                unsigned i, dw, nr = MIN2(count, emit.vtx_max);
                dw = nr * emit.vtx_dwords;
 
-               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+               set_edgeflag(chan, tesla, &emit, *map);
+
+               BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
                for (i = 0; i < nr; ++i)
                        emit_vtx(chan, &emit, *map++);
 
                count -= nr;
        }
-       nv50_unmap_vbufs(nv50);
 
        return TRUE;
 }
@@ -779,13 +1178,14 @@ nv50_push_elements_u08(struct nv50_context *nv50, uint8_t *map, unsigned count)
                unsigned i, dw, nr = MIN2(count, emit.vtx_max);
                dw = nr * emit.vtx_dwords;
 
-               BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+               set_edgeflag(chan, tesla, &emit, *map);
+
+               BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
                for (i = 0; i < nr; ++i)
                        emit_vtx(chan, &emit, *map++);
 
                count -= nr;
        }
-       nv50_unmap_vbufs(nv50);
 
        return TRUE;
 }