From 8eb0fc430a8c1687627156a06faf5762144022f3 Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Sat, 7 Aug 2010 05:39:18 +0200 Subject: [PATCH] nvfx: rewrite draw code and buffer code This is a full rewrite of the drawing and buffer management logic. It offers a lot of improvements: 1. A copy of buffers is now always kept in system memory. This is necessary to allow software processing of them, which is necessary or improves performance in many cases. 2. Support for pushing vertices on the FIFO, with index lookup if necessary. 3. "Smart" draw code that tries to intelligently choose the cheapest way to draw something: whether to use inline vertices or hardware vertex buffer, and whether to use hardware index buffers 4. Support for all vertex formats supported by the hardware 5. Usage of translate to push vertices, supporting all formats that are sensible to use as vertex formats 6. Support for base vertex 7. Usage of Ben Skeggs' primitive splitter originally for nv50, allowing correct splitting of line loops, triangle fans, etc. 8. Support for instancing 9. Precomputation using the vertex elements CSO Thanks to Ben Skeggs for his primitive splitter originally for nv50. Thanks to Christoph Bumiller for his nv50 push code, that was the basis of this work, even though I changed his code dramatically, in particular to replace his ad-hoc vertex data emitter with translate. The changes could also go into nv50 too, but there are substantial differences due to the additional nv50 hardware features. --- src/gallium/drivers/nouveau/nouveau_class.h | 12 +- src/gallium/drivers/nouveau/nouveau_util.h | 91 -- src/gallium/drivers/nvfx/Makefile | 1 + src/gallium/drivers/nvfx/nv30_fragtex.c | 7 +- src/gallium/drivers/nvfx/nvfx_buffer.c | 98 +- src/gallium/drivers/nvfx/nvfx_context.c | 3 + src/gallium/drivers/nvfx/nvfx_context.h | 99 +- src/gallium/drivers/nvfx/nvfx_draw.c | 59 +- src/gallium/drivers/nvfx/nvfx_fragprog.c | 7 +- src/gallium/drivers/nvfx/nvfx_push.c | 402 ++++++++ src/gallium/drivers/nvfx/nvfx_resource.c | 6 - src/gallium/drivers/nvfx/nvfx_resource.h | 91 +- src/gallium/drivers/nvfx/nvfx_screen.c | 33 +- src/gallium/drivers/nvfx/nvfx_screen.h | 13 + src/gallium/drivers/nvfx/nvfx_state.c | 84 -- src/gallium/drivers/nvfx/nvfx_state_emit.c | 51 +- src/gallium/drivers/nvfx/nvfx_state_fb.c | 5 +- src/gallium/drivers/nvfx/nvfx_surface.c | 23 +- src/gallium/drivers/nvfx/nvfx_transfer.c | 173 +++- src/gallium/drivers/nvfx/nvfx_vbo.c | 1016 +++++++++---------- src/gallium/drivers/nvfx/nvfx_vertprog.c | 12 +- 21 files changed, 1365 insertions(+), 921 deletions(-) delete mode 100644 src/gallium/drivers/nouveau/nouveau_util.h create mode 100644 src/gallium/drivers/nvfx/nvfx_push.c diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h index 685fa00b455..14c11b278ad 100644 --- a/src/gallium/drivers/nouveau/nouveau_class.h +++ b/src/gallium/drivers/nouveau/nouveau_class.h @@ -6149,6 +6149,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV34TCL_FP_REG_CONTROL_UNK1_MASK 0xffff0000 #define NV34TCL_FP_REG_CONTROL_UNK0_SHIFT 0 #define NV34TCL_FP_REG_CONTROL_UNK0_MASK 0x0000ffff +#define NV34TCL_EDGEFLAG_ENABLE 0x0000145c #define NV34TCL_VP_CLIP_PLANES_ENABLE 0x00001478 #define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 (1 << 1) #define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 (1 << 5) @@ -6182,10 +6183,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV34TCL_VTXFMT__SIZE 0x00000010 #define NV34TCL_VTXFMT_TYPE_SHIFT 0 #define NV34TCL_VTXFMT_TYPE_MASK 0x0000000f -#define NV34TCL_VTXFMT_TYPE_FLOAT 0x00000002 -#define NV34TCL_VTXFMT_TYPE_HALF 0x00000003 -#define NV34TCL_VTXFMT_TYPE_UBYTE 0x00000004 -#define NV34TCL_VTXFMT_TYPE_USHORT 0x00000005 +#define NV34TCL_VTXFMT_TYPE_16_SNORM 0x00000001 +#define NV34TCL_VTXFMT_TYPE_32_FLOAT 0x00000002 +#define NV34TCL_VTXFMT_TYPE_16_FLOAT 0x00000003 +#define NV34TCL_VTXFMT_TYPE_8_UNORM 0x00000004 +#define NV34TCL_VTXFMT_TYPE_16_SSCALED 0x00000005 +#define NV34TCL_VTXFMT_TYPE_11_11_10_SNORM 0x00000006 +#define NV34TCL_VTXFMT_TYPE_8_USCALED 0x00000007 #define NV34TCL_VTXFMT_SIZE_SHIFT 4 #define NV34TCL_VTXFMT_SIZE_MASK 0x000000f0 #define NV34TCL_VTXFMT_STRIDE_SHIFT 8 diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h deleted file mode 100644 index b165f7a611a..00000000000 --- a/src/gallium/drivers/nouveau/nouveau_util.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef __NOUVEAU_UTIL_H__ -#define __NOUVEAU_UTIL_H__ - -/* Determine how many vertices can be pushed into the command stream. - * Where the remaining space isn't large enough to represent all verices, - * split the buffer at primitive boundaries. - * - * Returns a count of vertices that can be rendered, and an index to - * restart drawing at after a flush. - */ -static INLINE unsigned -nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp, - unsigned mode, unsigned start, unsigned count, - unsigned *restart) -{ - int max, adj = 0; - - max = remaining - overhead; - if (max < 0) - return 0; - - max *= vpp; - if (max >= count) - return count; - - switch (mode) { - case PIPE_PRIM_POINTS: - break; - case PIPE_PRIM_LINES: - max = max & 1; - break; - case PIPE_PRIM_TRIANGLES: - max = max - (max % 3); - break; - case PIPE_PRIM_QUADS: - max = max & ~3; - break; - case PIPE_PRIM_LINE_LOOP: - case PIPE_PRIM_LINE_STRIP: - if (max < 2) - max = 0; - adj = 1; - break; - case PIPE_PRIM_POLYGON: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - if (max < 3) - max = 0; - adj = 2; - break; - case PIPE_PRIM_QUAD_STRIP: - if (max < 4) - max = 0; - adj = 3; - break; - default: - assert(0); - } - - *restart = start + max - adj; - return max; -} - -/* Integer base-2 logarithm, rounded towards zero. */ -static INLINE unsigned log2i(unsigned i) -{ - unsigned r = 0; - - if (i & 0xffff0000) { - i >>= 16; - r += 16; - } - if (i & 0x0000ff00) { - i >>= 8; - r += 8; - } - if (i & 0x000000f0) { - i >>= 4; - r += 4; - } - if (i & 0x0000000c) { - i >>= 2; - r += 2; - } - if (i & 0x00000002) { - r += 1; - } - return r; -} - -#endif diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile index 2834f8984c7..6cbbad699eb 100644 --- a/src/gallium/drivers/nvfx/Makefile +++ b/src/gallium/drivers/nvfx/Makefile @@ -14,6 +14,7 @@ C_SOURCES = \ nv30_fragtex.c \ nv40_fragtex.c \ nvfx_miptree.c \ + nvfx_push.c \ nvfx_query.c \ nvfx_resource.c \ nvfx_screen.c \ diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c index 63c578a0ce1..db8a8fc4b08 100644 --- a/src/gallium/drivers/nvfx/nv30_fragtex.c +++ b/src/gallium/drivers/nvfx/nv30_fragtex.c @@ -1,7 +1,6 @@ #include "util/u_format.h" #include "nvfx_context.h" -#include "nouveau/nouveau_util.h" #include "nvfx_tex.h" #include "nvfx_resource.h" @@ -44,9 +43,9 @@ nv30_sampler_view_init(struct pipe_context *pipe, txf = sv->u.init_fmt; txf |= (level != sv->base.last_level ? 
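/* Editorial worked example, not part of the patch: how the new
 * NV34TCL_VTXFMT_* fields above combine into a per-attribute format word.
 * The type lives in bits 0-3, the component count in bits 4-7 and the
 * stride starts at bit 8, so a 3-component 32-bit float attribute fetched
 * with a 24-byte stride would be encoded as:
 *
 *   NV34TCL_VTXFMT_TYPE_32_FLOAT
 *     | (3  << NV34TCL_VTXFMT_SIZE_SHIFT)
 *     | (24 << NV34TCL_VTXFMT_STRIDE_SHIFT)
 *   = 0x00001832
 */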
NV34TCL_TX_FORMAT_MIPMAP : 0); - txf |= log2i(u_minify(pt->width0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT; - txf |= log2i(u_minify(pt->height0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT; - txf |= log2i(u_minify(pt->depth0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT; + txf |= util_logbase2(u_minify(pt->width0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT; + txf |= util_logbase2(u_minify(pt->height0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT; + txf |= util_logbase2(u_minify(pt->depth0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT; txf |= 0x10000; sv->u.nv30.fmt[0] = tf->fmt[0] | txf; diff --git a/src/gallium/drivers/nvfx/nvfx_buffer.c b/src/gallium/drivers/nvfx/nvfx_buffer.c index 44680e51959..89bb8570efd 100644 --- a/src/gallium/drivers/nvfx/nvfx_buffer.c +++ b/src/gallium/drivers/nvfx/nvfx_buffer.c @@ -6,13 +6,16 @@ #include "nouveau/nouveau_screen.h" #include "nouveau/nouveau_winsys.h" #include "nvfx_resource.h" +#include "nvfx_screen.h" void nvfx_buffer_destroy(struct pipe_screen *pscreen, struct pipe_resource *presource) { - struct nvfx_resource *buffer = nvfx_resource(presource); + struct nvfx_buffer *buffer = nvfx_buffer(presource); - nouveau_screen_bo_release(pscreen, buffer->bo); + if(!(buffer->base.base.flags & NVFX_RESOURCE_FLAG_USER)) + align_free(buffer->data); + nouveau_screen_bo_release(pscreen, buffer->base.bo); FREE(buffer); } @@ -20,31 +23,22 @@ struct pipe_resource * nvfx_buffer_create(struct pipe_screen *pscreen, const struct pipe_resource *template) { - struct nvfx_resource *buffer; + struct nvfx_screen* screen = nvfx_screen(pscreen); + struct nvfx_buffer* buffer; - buffer = CALLOC_STRUCT(nvfx_resource); + buffer = CALLOC_STRUCT(nvfx_buffer); if (!buffer) return NULL; - buffer->base = *template; - buffer->base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - pipe_reference_init(&buffer->base.reference, 1); - buffer->base.screen = pscreen; + buffer->base.base = *template; + buffer->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; + pipe_reference_init(&buffer->base.base.reference, 1); + buffer->base.base.screen = pscreen; + buffer->size = util_format_get_stride(template->format, template->width0); + buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold; + buffer->data = align_malloc(buffer->size, 16); - buffer->bo = nouveau_screen_bo_new(pscreen, - 16, - buffer->base.usage, - buffer->base.bind, - buffer->base.width0); - - if (buffer->bo == NULL) - goto fail; - - return &buffer->base; - -fail: - FREE(buffer); - return NULL; + return &buffer->base.base; } @@ -54,29 +48,49 @@ nvfx_user_buffer_create(struct pipe_screen *pscreen, unsigned bytes, unsigned usage) { - struct nvfx_resource *buffer; + struct nvfx_screen* screen = nvfx_screen(pscreen); + struct nvfx_buffer* buffer; - buffer = CALLOC_STRUCT(nvfx_resource); + buffer = CALLOC_STRUCT(nvfx_buffer); if (!buffer) return NULL; - pipe_reference_init(&buffer->base.reference, 1); - buffer->base.flags = NVFX_RESOURCE_FLAG_LINEAR; - buffer->base.screen = pscreen; - buffer->base.format = PIPE_FORMAT_R8_UNORM; - buffer->base.usage = PIPE_USAGE_IMMUTABLE; - buffer->base.bind = usage; - buffer->base.width0 = bytes; - buffer->base.height0 = 1; - buffer->base.depth0 = 1; - - buffer->bo = nouveau_screen_bo_user(pscreen, ptr, bytes); - if (!buffer->bo) - goto fail; - - return &buffer->base; + pipe_reference_init(&buffer->base.base.reference, 1); + buffer->base.base.flags = NVFX_RESOURCE_FLAG_LINEAR | NVFX_RESOURCE_FLAG_USER; + buffer->base.base.screen = pscreen; + buffer->base.base.format = 
PIPE_FORMAT_R8_UNORM; + buffer->base.base.usage = PIPE_USAGE_IMMUTABLE; + buffer->base.base.bind = usage; + buffer->base.base.width0 = bytes; + buffer->base.base.height0 = 1; + buffer->base.base.depth0 = 1; + buffer->data = ptr; + buffer->size = bytes; + buffer->bytes_to_draw_until_static = bytes * screen->static_reuse_threshold; + buffer->dirty_end = bytes; + + return &buffer->base.base; +} -fail: - FREE(buffer); - return NULL; +void nvfx_buffer_upload(struct nvfx_buffer* buffer) +{ + unsigned dirty = buffer->dirty_end - buffer->dirty_begin; + if(!buffer->base.bo) + { + buffer->base.bo = nouveau_screen_bo_new(buffer->base.base.screen, + 16, + buffer->base.base.usage, + buffer->base.base.bind, + buffer->base.base.width0); + } + + if(dirty) + { + // TODO: may want to use a temporary in some cases + nouveau_bo_map(buffer->base.bo, NOUVEAU_BO_WR + | (buffer->dirty_unsynchronized ? NOUVEAU_BO_NOSYNC : 0)); + memcpy(buffer->base.bo->map + buffer->dirty_begin, buffer->data + buffer->dirty_begin, dirty); + nouveau_bo_unmap(buffer->base.bo); + buffer->dirty_begin = buffer->dirty_end = 0; + } } diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c index 1980176b23e..94c854b22b8 100644 --- a/src/gallium/drivers/nvfx/nvfx_context.c +++ b/src/gallium/drivers/nvfx/nvfx_context.c @@ -76,7 +76,9 @@ nvfx_create(struct pipe_screen *pscreen, void *priv) nvfx_init_surface_functions(nvfx); nvfx_init_state_functions(nvfx); nvfx_init_sampling_functions(nvfx); + nvfx_init_vbo_functions(nvfx); nvfx_init_resource_functions(&nvfx->pipe); + nvfx_init_transfer_functions(&nvfx->pipe); /* Create, configure, and install fallback swtnl path */ nvfx->draw = draw_create(&nvfx->pipe); @@ -89,6 +91,7 @@ nvfx_create(struct pipe_screen *pscreen, void *priv) /* set these to that we init them on first validation */ nvfx->state.scissor_enabled = ~0; nvfx->state.stipple_enabled = ~0; + nvfx->use_vertex_buffers = -1; LIST_INITHEAD(&nvfx->render_cache); diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h index bce19df044d..8899bf991e1 100644 --- a/src/gallium/drivers/nvfx/nvfx_context.h +++ b/src/gallium/drivers/nvfx/nvfx_context.h @@ -44,6 +44,7 @@ #define NVFX_NEW_SR (1 << 13) #define NVFX_NEW_VERTCONST (1 << 14) #define NVFX_NEW_FRAGCONST (1 << 15) +#define NVFX_NEW_INDEX (1 << 16) struct nvfx_rasterizer_state { struct pipe_rasterizer_state pipe; @@ -71,9 +72,53 @@ struct nvfx_state { unsigned render_temps; }; +struct nvfx_per_vertex_element { + unsigned idx; + unsigned vertex_buffer_index; + unsigned src_offset; +}; + +struct nvfx_low_frequency_element { + unsigned idx; + unsigned vertex_buffer_index; + unsigned src_offset; + void (*fetch_rgba_float)(float *dst, const uint8_t *src, unsigned i, unsigned j); + unsigned ncomp; +}; + +struct nvfx_per_instance_element { + struct nvfx_low_frequency_element base; + unsigned instance_divisor; +}; + +struct nvfx_per_vertex_buffer_info +{ + unsigned vertex_buffer_index; + unsigned per_vertex_size; +}; + struct nvfx_vtxelt_state { struct pipe_vertex_element pipe[16]; unsigned num_elements; + unsigned vtxfmt[16]; + + unsigned num_per_vertex_buffer_infos; + struct nvfx_per_vertex_buffer_info per_vertex_buffer_info[16]; + + unsigned num_per_vertex; + struct nvfx_per_vertex_element per_vertex[16]; + + unsigned num_per_instance; + struct nvfx_per_instance_element per_instance[16]; + + unsigned num_constant; + struct nvfx_low_frequency_element constant[16]; + + boolean needs_translate; + struct translate* 
translate; + + unsigned vertex_length; + unsigned max_vertices_per_packet; }; struct nvfx_render_target { @@ -127,8 +172,6 @@ struct nvfx_context { struct pipe_viewport_state viewport; struct pipe_framebuffer_state framebuffer; struct pipe_index_buffer idxbuf; - struct pipe_resource *idxbuf_buffer; - unsigned idxbuf_format; struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS]; unsigned nr_samplers; @@ -137,8 +180,14 @@ struct nvfx_context { struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; unsigned vtxbuf_nr; struct nvfx_vtxelt_state *vtxelt; + int base_vertex; + boolean use_index_buffer; + /* -1 = hardware input setup is outdated + * 0 = hardware input setup is for inline vertices + * 1 = hardware input setup is for hardware vertices + */ + int use_vertex_buffers; - unsigned vbo_bo; unsigned hw_vtxelt_nr; uint8_t hw_samplers; uint32_t hw_txf[8]; @@ -180,11 +229,7 @@ extern void nvfx_clear(struct pipe_context *pipe, unsigned buffers, /* nvfx_draw.c */ extern struct draw_stage *nvfx_draw_render_stage(struct nvfx_context *nvfx); -extern void nvfx_draw_elements_swtnl(struct pipe_context *pipe, - struct pipe_resource *idxbuf, - unsigned ib_size, int ib_bias, - unsigned mode, - unsigned start, unsigned count); +extern void nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info); extern void nvfx_vtxfmt_validate(struct nvfx_context *nvfx); /* nvfx_fb.c */ @@ -245,17 +290,53 @@ extern boolean nvfx_state_validate_swtnl(struct nvfx_context *nvfx); extern void nvfx_state_emit(struct nvfx_context *nvfx); /* nvfx_transfer.c */ -extern void nvfx_init_transfer_functions(struct nvfx_context *nvfx); +extern void nvfx_init_transfer_functions(struct pipe_context *pipe); /* nvfx_vbo.c */ extern boolean nvfx_vbo_validate(struct nvfx_context *nvfx); extern void nvfx_vbo_relocate(struct nvfx_context *nvfx); +extern void nvfx_idxbuf_validate(struct nvfx_context* nvfx); +extern void nvfx_idxbuf_relocate(struct nvfx_context* nvfx); extern void nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); +extern void nvfx_init_vbo_functions(struct nvfx_context *nvfx); +extern unsigned nvfx_vertex_formats[]; /* nvfx_vertprog.c */ extern boolean nvfx_vertprog_validate(struct nvfx_context *nvfx); extern void nvfx_vertprog_destroy(struct nvfx_context *, struct nvfx_vertex_program *); +/* nvfx_push.c */ +extern void nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); + +/* must WAIT_RING(chan, ncomp + 1) or equivalent beforehand! 
*/ +static inline void nvfx_emit_vtx_attr(struct nouveau_channel* chan, unsigned attrib, float* v, unsigned ncomp) +{ + switch (ncomp) { + case 4: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + OUT_RING(chan, fui(v[2])); + OUT_RING(chan, fui(v[3])); + break; + case 3: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + OUT_RING(chan, fui(v[2])); + break; + case 2: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + break; + case 1: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1)); + OUT_RING(chan, fui(v[0])); + break; + } +} + #endif diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c index 22cff370b77..331e28418ad 100644 --- a/src/gallium/drivers/nvfx/nvfx_draw.c +++ b/src/gallium/drivers/nvfx/nvfx_draw.c @@ -9,6 +9,7 @@ #include "draw/draw_pipe.h" #include "nvfx_context.h" +#include "nvfx_resource.h" /* Simple, but crappy, swtnl path, hopefully we wont need to hit this very * often at all. Uses "quadro style" vertex submission + a fixed vertex @@ -39,30 +40,21 @@ nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v) unsigned idx = nvfx->swtnl.draw[i]; unsigned hw = nvfx->swtnl.hw[i]; + WAIT_RING(chan, 5); switch (nvfx->swtnl.emit[i]) { case EMIT_OMIT: break; case EMIT_1F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_1F(hw), 1); - OUT_RING (chan, fui(v->data[idx][0])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 1); break; case EMIT_2F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_2F_X(hw), 2); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 2); break; case EMIT_3F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_3F_X(hw), 3); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); - OUT_RING (chan, fui(v->data[idx][2])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 3); break; case EMIT_4F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); - OUT_RING (chan, fui(v->data[idx][2])); - OUT_RING (chan, fui(v->data[idx][3])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 4); break; case 0xff: BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4); @@ -231,15 +223,9 @@ nvfx_draw_render_stage(struct nvfx_context *nvfx) } void -nvfx_draw_elements_swtnl(struct pipe_context *pipe, - struct pipe_resource *idxbuf, - unsigned idxbuf_size, int idxbuf_bias, - unsigned mode, unsigned start, unsigned count) +nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info) { struct nvfx_context *nvfx = nvfx_context(pipe); - struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS]; - struct pipe_transfer *ib_transfer = NULL; - struct pipe_transfer *cb_transfer = NULL; unsigned i; void *map; @@ -247,18 +233,15 @@ nvfx_draw_elements_swtnl(struct pipe_context *pipe, return; nvfx_state_emit(nvfx); + /* these must be passed without adding the offsets */ for (i = 0; i < nvfx->vtxbuf_nr; i++) { - map = pipe_buffer_map(pipe, nvfx->vtxbuf[i].buffer, - PIPE_TRANSFER_READ, - &vb_transfer[i]); + map = nvfx_buffer(nvfx->vtxbuf[i].buffer)->data; draw_set_mapped_vertex_buffer(nvfx->draw, i, map); } - if (idxbuf) { - map = pipe_buffer_map(pipe, idxbuf, - PIPE_TRANSFER_READ, - &ib_transfer); - draw_set_mapped_element_buffer(nvfx->draw, idxbuf_size, 
idxbuf_bias, map); + if (info->indexed) { + map = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset; + draw_set_mapped_element_buffer_range(nvfx->draw, nvfx->idxbuf.index_size, info->index_bias, info->min_index, info->max_index, map); } else { draw_set_mapped_element_buffer(nvfx->draw, 0, 0, NULL); } @@ -266,28 +249,14 @@ nvfx_draw_elements_swtnl(struct pipe_context *pipe, if (nvfx->constbuf[PIPE_SHADER_VERTEX]) { const unsigned nr = nvfx->constbuf_nr[PIPE_SHADER_VERTEX]; - map = pipe_buffer_map(pipe, - nvfx->constbuf[PIPE_SHADER_VERTEX], - PIPE_TRANSFER_READ, - &cb_transfer); + map = nvfx_buffer(nvfx->constbuf[PIPE_SHADER_VERTEX])->data; draw_set_mapped_constant_buffer(nvfx->draw, PIPE_SHADER_VERTEX, 0, map, nr); } - draw_arrays(nvfx->draw, mode, start, count); - - for (i = 0; i < nvfx->vtxbuf_nr; i++) - pipe_buffer_unmap(pipe, nvfx->vtxbuf[i].buffer, vb_transfer[i]); - - if (idxbuf) - pipe_buffer_unmap(pipe, idxbuf, ib_transfer); - - if (nvfx->constbuf[PIPE_SHADER_VERTEX]) - pipe_buffer_unmap(pipe, nvfx->constbuf[PIPE_SHADER_VERTEX], - cb_transfer); + draw_arrays_instanced(nvfx->draw, info->mode, info->start, info->count, info->start_instance, info->instance_count); draw_flush(nvfx->draw); - pipe->flush(pipe, 0, NULL); } static INLINE void diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c index ee41f03b9b8..ae4fe3aa262 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragprog.c +++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c @@ -9,6 +9,7 @@ #include "nvfx_context.h" #include "nvfx_shader.h" +#include "nvfx_resource.h" #define MAX_CONSTS 128 #define MAX_IMM 32 @@ -925,10 +926,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) { struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT]; - // TODO: avoid using transfers, just directly the buffer - struct pipe_transfer* transfer; - // TODO: does this check make any sense, or should we do this unconditionally? - uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer); + uint32_t* map = (uint32_t*)nvfx_buffer(constbuf)->data; uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset); uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset); int i; @@ -942,7 +940,6 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t)); } } - pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer); } } diff --git a/src/gallium/drivers/nvfx/nvfx_push.c b/src/gallium/drivers/nvfx/nvfx_push.c new file mode 100644 index 00000000000..52e891c6678 --- /dev/null +++ b/src/gallium/drivers/nvfx/nvfx_push.c @@ -0,0 +1,402 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_inlines.h" +#include "util/u_format.h" +#include "util/u_split_prim.h" +#include "translate/translate.h" + +#include "nvfx_context.h" +#include "nvfx_resource.h" + +struct push_context { + struct nouveau_channel* chan; + + void *idxbuf; + int32_t idxbias; + + float edgeflag; + int edgeflag_attr; + + unsigned vertex_length; + unsigned max_vertices_per_packet; + + struct translate* translate; +}; + +static void +emit_edgeflag(void *priv, boolean enabled) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + + OUT_RING(chan, RING_3D(NV34TCL_EDGEFLAG_ENABLE, 1)); + OUT_RING(chan, enabled ? 
1 : 0); +} + +static void +emit_vertices_lookup8(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + uint8_t* elts = (uint8_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts8(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices_lookup16(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + uint16_t* elts = (uint16_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts16(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices_lookup32(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + uint32_t* elts = (uint32_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run(ctx->translate, start, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + start += push; + } +} + +static void +emit_ranges(void* priv, unsigned start, unsigned vc, unsigned reg) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + unsigned nr = (vc & 0xff); + if (nr) { + OUT_RING(chan, RING_3D(reg, 1)); + OUT_RING (chan, ((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 
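/* Editorial sketch, not part of the patch: the emit_vertices* callbacks
 * above all drive the translate object prebuilt from the vertex-elements
 * CSO in the same way. The helper below shows that pattern in isolation;
 * push_with_translate and its parameters are hypothetical, while
 * set_buffer() and run() are called exactly as in the callbacks above
 * (the output normally points at the pushbuffer cursor, chan->cur, which
 * is then advanced by count * vertex_length words). It relies on the
 * translate/translate.h include already at the top of this file. */
static void
push_with_translate(struct translate *t,
                    uint8_t *vb_data, unsigned vb_stride,
                    unsigned start, unsigned count,
                    void *output)
{
        /* bind the CPU-visible vertex buffer as translate input 0 */
        t->set_buffer(t, 0, vb_data, vb_stride, ~0);

        /* convert `count` vertices starting at `start` (instance id 0)
         * into the hardware vertex layout at `output` */
        t->run(t, start, count, 0, output);
}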
2047 : nr; + + nr -= push; + + OUT_RING(chan, RING_3D_NI(reg, push)); + while (push--) { + OUT_RING(chan, ((0x100 - 1) << 24) | start); + start += 0x100; + } + } +} + +static void +emit_ib_ranges(void* priv, unsigned start, unsigned vc) +{ + emit_ranges(priv, start, vc, NV34TCL_VB_INDEX_BATCH); +} + +static void +emit_vb_ranges(void* priv, unsigned start, unsigned vc) +{ + emit_ranges(priv, start, vc, NV34TCL_VB_VERTEX_BATCH); +} + +static INLINE void +emit_elt8(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint8_t *elts = (uint8_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + if (vc & 1) { + OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); + OUT_RING (chan, elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + unsigned push = MIN2(vc, 2047 * 2); + + OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); + for (i = 0; i < push; i+=2) + OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias)); + + vc -= push; + elts += push; + } +} + +static INLINE void +emit_elt16(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint16_t *elts = (uint16_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + if (vc & 1) { + OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); + OUT_RING (chan, elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + unsigned push = MIN2(vc, 2047 * 2); + + OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); + for (i = 0; i < push; i+=2) + OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias)); + + vc -= push; + elts += push; + } +} + +static INLINE void +emit_elt32(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint32_t *elts = (uint32_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + while (vc) { + unsigned push = MIN2(vc, 2047); + + OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push)); + assert(AVAIL_RING(chan) >= push); + if(idxbias) + { + for(unsigned i = 0; i < push; ++i) + OUT_RING(chan, elts[i] + idxbias); + } + else + OUT_RINGp(chan, elts, push); + + vc -= push; + elts += push; + } +} + +void +nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nouveau_channel *chan = nvfx->screen->base.channel; + struct push_context ctx; + struct util_split_prim s; + unsigned instances_left = info->instance_count; + int vtx_value; + unsigned hw_mode = nvgl_primitive(info->mode); + int i; + struct + { + uint8_t* map; + unsigned step; + } per_instance[16]; + unsigned p_overhead = 0 + + 4 /* begin/end */ + + 4; /* potential edgeflag enable/disable */ + + ctx.chan = nvfx->screen->base.channel; + ctx.translate = nvfx->vtxelt->translate; + ctx.idxbuf = NULL; + ctx.vertex_length = nvfx->vtxelt->vertex_length; + ctx.max_vertices_per_packet = nvfx->vtxelt->max_vertices_per_packet; + ctx.edgeflag = 0.5f; + // TODO: figure out if we really want to handle this, and do so in that case + ctx.edgeflag_attr = 0xff; // nvfx->vertprog->cfg.edgeflag_in; + + if(!nvfx->use_vertex_buffers) + { + for(i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + uint8_t* data = nvfx_buffer(vb->buffer)->data + vb->buffer_offset; + 
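/* Editorial worked example, not part of the patch: emit_ranges() above
 * encodes range draws as 32-bit NV34TCL_VB_VERTEX_BATCH (or
 * NV34TCL_VB_INDEX_BATCH) words of the form ((count - 1) << 24) | first,
 * so at most 256 elements per word and up to 2047 words per
 * non-incrementing method packet. Drawing 600 vertices starting at
 * vertex 100 therefore takes three words:
 *
 *   ((88  - 1) << 24) | 100    vertices 100..187   (600 & 0xff == 88)
 *   ((256 - 1) << 24) | 188    vertices 188..443   (600 >> 8 == 2 full words)
 *   ((256 - 1) << 24) | 444    vertices 444..699
 */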
if(info->indexed) + data += info->index_bias * vb->stride; + ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0); + } + + if(ctx.edgeflag_attr < 16) + vtx_value = -(ctx.vertex_length + 3); /* vertex data and edgeflag header and value */ + else + { + p_overhead += 1; /* initial vertex_data header */ + vtx_value = -ctx.vertex_length; /* vertex data and edgeflag header and value */ + } + + if (info->indexed) { + // XXX: this case and is broken and probably need a new VTX_ATTR push path + if (nvfx->idxbuf.index_size == 1) + s.emit = emit_vertices_lookup8; + else if (nvfx->idxbuf.index_size == 2) + s.emit = emit_vertices_lookup16; + else + s.emit = emit_vertices_lookup32; + } else + s.emit = emit_vertices; + } + else + { + if(!info->indexed || nvfx->use_index_buffer) + { + s.emit = info->indexed ? emit_ib_ranges : emit_vb_ranges; + p_overhead += 3; + vtx_value = 0; + } + else if (nvfx->idxbuf.index_size == 4) + { + s.emit = emit_elt32; + p_overhead += 1; + vtx_value = 8; + } + else + { + s.emit = (nvfx->idxbuf.index_size == 2) ? emit_elt16 : emit_elt8; + p_overhead += 3; + vtx_value = 7; + } + } + + ctx.idxbias = info->index_bias; + if(nvfx->use_vertex_buffers) + ctx.idxbias -= nvfx->base_vertex; + + /* map index buffer, if present */ + if (info->indexed && !nvfx->use_index_buffer) + ctx.idxbuf = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset; + + s.priv = &ctx; + s.edge = emit_edgeflag; + + for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i) + { + struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index]; + float v[4]; + per_instance[i].step = info->start_instance % ve->instance_divisor; + per_instance[i].map = nvfx_buffer(vb->buffer)->data + vb->buffer_offset + ve->base.src_offset; + + nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0); + + WAIT_RING(chan, 5); + nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp); + } + + /* per-instance loop */ + while (instances_left--) { + int max_verts; + boolean done; + + util_split_prim_init(&s, info->mode, info->start, info->count); + nvfx_state_emit(nvfx); + for(;;) { + max_verts = AVAIL_RING(chan); + max_verts -= p_overhead; + + /* if vtx_value < 0, each vertex is -vtx_value words long + * otherwise, each vertex is 2^(vtx_value) / 255 words long (this is an approximation) + */ + if(vtx_value < 0) + { + max_verts /= -vtx_value; + max_verts -= (max_verts >> 10); /* vertex data headers */ + } + else + { + if(max_verts >= (1 << 23)) /* avoid overflow here */ + max_verts = (1 << 23); + max_verts = (max_verts * 255) >> vtx_value; + } + + //printf("avail %u max_verts %u\n", AVAIL_RING(chan), max_verts); + + if(max_verts >= 16) + { + OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); + OUT_RING(chan, hw_mode); + done = util_split_prim_next(&s, max_verts); + OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); + OUT_RING(chan, 0); + + if(done) + break; + } + + FIRE_RING(chan); + nvfx_state_emit(nvfx); + } + + /* set data for the next instance, if any changed */ + for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i) + { + struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index]; + + if(++per_instance[i].step == ve->instance_divisor) + { + float v[4]; + per_instance[i].map += vb->stride; + per_instance[i].step = 0; + + 
nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0); + WAIT_RING(chan, 5); + nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp); + } + } + } +} diff --git a/src/gallium/drivers/nvfx/nvfx_resource.c b/src/gallium/drivers/nvfx/nvfx_resource.c index 1c921b47100..3a46e0a7a57 100644 --- a/src/gallium/drivers/nvfx/nvfx_resource.c +++ b/src/gallium/drivers/nvfx/nvfx_resource.c @@ -59,12 +59,6 @@ nvfx_resource_get_handle(struct pipe_screen *pscreen, void nvfx_init_resource_functions(struct pipe_context *pipe) { - pipe->get_transfer = nvfx_transfer_new; - pipe->transfer_map = nvfx_transfer_map; - pipe->transfer_flush_region = u_default_transfer_flush_region; - pipe->transfer_unmap = nvfx_transfer_unmap; - pipe->transfer_destroy = util_staging_transfer_destroy; - pipe->transfer_inline_write = u_default_transfer_inline_write; pipe->is_resource_referenced = nvfx_resource_is_referenced; } diff --git a/src/gallium/drivers/nvfx/nvfx_resource.h b/src/gallium/drivers/nvfx/nvfx_resource.h index ff86f6d9cb6..583be4de2ae 100644 --- a/src/gallium/drivers/nvfx/nvfx_resource.h +++ b/src/gallium/drivers/nvfx/nvfx_resource.h @@ -17,8 +17,23 @@ struct nvfx_resource { struct nouveau_bo *bo; }; +static INLINE +struct nvfx_resource *nvfx_resource(struct pipe_resource *resource) +{ + return (struct nvfx_resource *)resource; +} + #define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) +#define NVFX_RESOURCE_FLAG_USER (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) + +/* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? */ +static INLINE boolean +nvfx_resource_mapped_by_gpu(struct pipe_resource *resource) +{ + return nvfx_resource(resource)->bo->handle; +} +/* is resource in VRAM? 
*/ static inline int nvfx_resource_on_gpu(struct pipe_resource* pr) { @@ -63,12 +78,6 @@ struct nvfx_surface { struct nvfx_miptree* temp; }; -static INLINE -struct nvfx_resource *nvfx_resource(struct pipe_resource *resource) -{ - return (struct nvfx_resource *)resource; -} - static INLINE struct nouveau_bo * nvfx_surface_buffer(struct pipe_surface *surf) { @@ -106,22 +115,6 @@ nvfx_miptree_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *template, struct winsys_handle *whandle); -struct pipe_resource * -nvfx_buffer_create(struct pipe_screen *pscreen, - const struct pipe_resource *template); - -void -nvfx_buffer_destroy(struct pipe_screen *pscreen, - struct pipe_resource *presource); - -struct pipe_resource * -nvfx_user_buffer_create(struct pipe_screen *screen, - void *ptr, - unsigned bytes, - unsigned usage); - - - void nvfx_miptree_surface_del(struct pipe_surface *ps); @@ -173,4 +166,58 @@ nvfx_surface_create_temp(struct pipe_context* pipe, struct pipe_surface* surf); void nvfx_surface_flush(struct pipe_context* pipe, struct pipe_surface* surf); +struct nvfx_buffer +{ + struct nvfx_resource base; + uint8_t* data; + unsigned size; + + /* the range of data not yet uploaded to the GPU bo */ + unsigned dirty_begin; + unsigned dirty_end; + + /* whether all transfers were unsynchronized */ + boolean dirty_unsynchronized; + + /* whether it would have been profitable to upload + * the latest updated data to the GPU immediately */ + boolean last_update_static; + + /* how many bytes we need to draw before we deem + * the buffer to be static + */ + long long bytes_to_draw_until_static; +}; + +static inline struct nvfx_buffer* nvfx_buffer(struct pipe_resource* pr) +{ + return (struct nvfx_buffer*)pr; +} + +/* this is an heuristic to determine whether we are better off uploading the + * buffer to the GPU, or just continuing pushing it on the FIFO + */ +static inline boolean nvfx_buffer_seems_static(struct nvfx_buffer* buffer) +{ + return buffer->last_update_static + || buffer->bytes_to_draw_until_static < 0; +} + +struct pipe_resource * +nvfx_buffer_create(struct pipe_screen *pscreen, + const struct pipe_resource *template); + +void +nvfx_buffer_destroy(struct pipe_screen *pscreen, + struct pipe_resource *presource); + +struct pipe_resource * +nvfx_user_buffer_create(struct pipe_screen *screen, + void *ptr, + unsigned bytes, + unsigned usage); + +void +nvfx_buffer_upload(struct nvfx_buffer* buffer); + #endif diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c index a1b8361a9a4..7e3caf8d2e3 100644 --- a/src/gallium/drivers/nvfx/nvfx_screen.c +++ b/src/gallium/drivers/nvfx/nvfx_screen.c @@ -163,11 +163,11 @@ nvfx_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_cap param) } static boolean -nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, +nvfx_screen_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, enum pipe_texture_target target, unsigned sample_count, - unsigned tex_usage, unsigned geom_flags) + unsigned bind, unsigned geom_flags) { struct nvfx_screen *screen = nvfx_screen(pscreen); struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front; @@ -175,7 +175,7 @@ nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, if (sample_count > 1) return FALSE; - if (tex_usage & PIPE_BIND_RENDER_TARGET) { + if (bind & PIPE_BIND_RENDER_TARGET) { switch (format) { case PIPE_FORMAT_B8G8R8A8_UNORM: case PIPE_FORMAT_B8G8R8X8_UNORM: @@ -186,7 +186,7 @@ 
nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, } } - if (tex_usage & PIPE_BIND_DEPTH_STENCIL) { + if (bind & PIPE_BIND_DEPTH_STENCIL) { switch (format) { case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: @@ -201,7 +201,7 @@ nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, } } - if (tex_usage & PIPE_BIND_SAMPLER_VIEW) { + if (bind & PIPE_BIND_SAMPLER_VIEW) { struct nvfx_texture_format* tf = &nvfx_texture_formats[format]; if(util_format_is_s3tc(format) && !util_format_s3tc_enabled) return FALSE; @@ -218,6 +218,22 @@ nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, } } + // note that we do actually support everything through translate + if (bind & PIPE_BIND_VERTEX_BUFFER) { + unsigned type = nvfx_vertex_formats[format]; + if(!type) + return FALSE; + } + + if (bind & PIPE_BIND_INDEX_BUFFER) { + // 8-bit indices supported, but not in hardware index buffer + if(format != PIPE_FORMAT_R16_USCALED && format != PIPE_FORMAT_R32_USCALED) + return FALSE; + } + + if(bind & PIPE_BIND_STREAM_OUTPUT) + return FALSE; + return TRUE; } @@ -387,7 +403,7 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) pscreen->destroy = nvfx_screen_destroy; pscreen->get_param = nvfx_screen_get_param; pscreen->get_paramf = nvfx_screen_get_paramf; - pscreen->is_format_supported = nvfx_screen_surface_format_supported; + pscreen->is_format_supported = nvfx_screen_is_format_supported; pscreen->context_create = nvfx_create; switch (dev->chipset & 0xf0) { @@ -419,6 +435,11 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) } screen->force_swtnl = debug_get_bool_option("NOUVEAU_SWTNL", FALSE); + screen->trace_draw = debug_get_bool_option("NVFX_TRACE_DRAW", FALSE); + + screen->buffer_allocation_cost = debug_get_num_option("NVFX_BUFFER_ALLOCATION_COST", 16384); + screen->inline_cost_per_hardware_cost = atof(debug_get_option("NVFX_INLINE_COST_PER_HARDWARE_COST", "1.0")); + screen->static_reuse_threshold = atof(debug_get_option("NVFX_STATIC_REUSE_THRESHOLD", "2.0")); screen->vertex_buffer_reloc_flags = nvfx_screen_get_vertex_buffer_flags(screen); diff --git a/src/gallium/drivers/nvfx/nvfx_screen.h b/src/gallium/drivers/nvfx/nvfx_screen.h index 4dedbe9cb40..473a1127752 100644 --- a/src/gallium/drivers/nvfx/nvfx_screen.h +++ b/src/gallium/drivers/nvfx/nvfx_screen.h @@ -16,6 +16,7 @@ struct nvfx_screen { unsigned is_nv4x; /* either 0 or ~0 */ boolean force_swtnl; + boolean trace_draw; unsigned vertex_buffer_reloc_flags; unsigned index_buffer_reloc_flags; @@ -33,6 +34,18 @@ struct nvfx_screen { struct nouveau_resource *vp_data_heap; struct nv04_2d_context* eng2d; + + /* Once the amount of bytes drawn from the buffer reaches the updated size times this value, + * we will assume that the buffer will be drawn an huge number of times before the + * next modification + */ + float static_reuse_threshold; + + /* Cost of allocating a buffer in terms of the cost of copying a byte to an hardware buffer */ + unsigned buffer_allocation_cost; + + /* inline_cost/hardware_cost conversion ration */ + float inline_cost_per_hardware_cost; }; static INLINE struct nvfx_screen * diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c index d459f9a8801..25d29720a85 100644 --- a/src/gallium/drivers/nvfx/nvfx_state.c +++ b/src/gallium/drivers/nvfx/nvfx_state.c @@ -441,83 +441,6 @@ nvfx_set_viewport_state(struct pipe_context *pipe, nvfx->draw_dirty |= NVFX_NEW_VIEWPORT; } -static void 
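/* Editorial sketch, not part of the patch: how the static_reuse_threshold
 * knob above interacts with a buffer's bytes_to_draw_until_static counter
 * (see struct nvfx_buffer in nvfx_resource.h). The toy_* names are
 * hypothetical; the real driver updates the counter in
 * nvfx_decide_upload_mode and nvfx_buffer_dirty_interval, and
 * nvfx_buffer_seems_static() additionally honours last_update_static. */
struct toy_buffer {
        unsigned size;                        /* total buffer size in bytes */
        long long bytes_to_draw_until_static; /* draw "credit" left before it counts as static */
};

/* creation and every synchronized CPU update make the buffer look dynamic
 * again (unsynchronized updates only add size * threshold worth of credit) */
static void toy_buffer_touched(struct toy_buffer *b, float static_reuse_threshold)
{
        b->bytes_to_draw_until_static = b->size * static_reuse_threshold;
}

/* every draw consumes credit proportional to the vertex data it reads */
static void toy_buffer_drawn(struct toy_buffer *b, unsigned bytes_read)
{
        b->bytes_to_draw_until_static -= bytes_read;
}

/* once the credit goes negative, uploading to a GPU buffer object is
 * assumed to pay off from then on */
static int toy_buffer_seems_static(const struct toy_buffer *b)
{
        return b->bytes_to_draw_until_static < 0;
}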
-nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count, - const struct pipe_vertex_buffer *vb) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - for(unsigned i = 0; i < count; ++i) - { - pipe_resource_reference(&nvfx->vtxbuf[i].buffer, vb[i].buffer); - nvfx->vtxbuf[i].buffer_offset = vb[i].buffer_offset; - nvfx->vtxbuf[i].max_index = vb[i].max_index; - nvfx->vtxbuf[i].stride = vb[i].stride; - } - - for(unsigned i = count; i < nvfx->vtxbuf_nr; ++i) - pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0); - - nvfx->vtxbuf_nr = count; - - nvfx->dirty |= NVFX_NEW_ARRAYS; - nvfx->draw_dirty |= NVFX_NEW_ARRAYS; -} - -static void -nvfx_set_index_buffer(struct pipe_context *pipe, - const struct pipe_index_buffer *ib) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - /* TODO make this more like a state */ - - if(ib) - { - pipe_resource_reference(&nvfx->idxbuf.buffer, ib->buffer); - nvfx->idxbuf.index_size = ib->index_size; - nvfx->idxbuf.offset = ib->offset; - } - else - { - pipe_resource_reference(&nvfx->idxbuf.buffer, 0); - nvfx->idxbuf.index_size = 0; - nvfx->idxbuf.offset = 0; - } -} - -static void * -nvfx_vtxelts_state_create(struct pipe_context *pipe, - unsigned num_elements, - const struct pipe_vertex_element *elements) -{ - struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state); - - assert(num_elements < 16); /* not doing fallbacks yet */ - cso->num_elements = num_elements; - memcpy(cso->pipe, elements, num_elements * sizeof(*elements)); - -/* nvfx_vtxelt_construct(cso);*/ - - return (void *)cso; -} - -static void -nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso) -{ - FREE(hwcso); -} - -static void -nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - nvfx->vtxelt = hwcso; - nvfx->dirty |= NVFX_NEW_ARRAYS; - /*nvfx->draw_dirty |= NVFX_NEW_ARRAYS;*/ -} - void nvfx_init_state_functions(struct nvfx_context *nvfx) { @@ -553,11 +476,4 @@ nvfx_init_state_functions(struct nvfx_context *nvfx) nvfx->pipe.set_polygon_stipple = nvfx_set_polygon_stipple; nvfx->pipe.set_scissor_state = nvfx_set_scissor_state; nvfx->pipe.set_viewport_state = nvfx_set_viewport_state; - - nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create; - nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete; - nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind; - - nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers; - nvfx->pipe.set_index_buffer = nvfx_set_index_buffer; } diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c index dc70f3de870..b9d18977919 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_emit.c +++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c @@ -8,6 +8,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) { struct nouveau_channel* chan = nvfx->screen->base.channel; unsigned dirty; + unsigned still_dirty = 0; int all_swizzled = -1; boolean flush_tex_cache = FALSE; @@ -52,11 +53,19 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) return FALSE; } - if(dirty & (NVFX_NEW_ARRAYS)) + if(dirty & NVFX_NEW_ARRAYS) { if(!nvfx_vbo_validate(nvfx)) return FALSE; } + + if(dirty & NVFX_NEW_INDEX) + { + if(nvfx->use_index_buffer) + nvfx_idxbuf_validate(nvfx); + else + still_dirty = NVFX_NEW_INDEX; + } } else { @@ -64,7 +73,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP)) nvfx_vertprog_validate(nvfx); - if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_FRAGPROG)) + 
if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_INDEX | NVFX_NEW_FRAGPROG)) nvfx_vtxfmt_validate(nvfx); } @@ -118,7 +127,24 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) OUT_RING(chan, 1); } } - nvfx->dirty = 0; + + nvfx->dirty = dirty & still_dirty; + + unsigned render_temps = nvfx->state.render_temps; + if(render_temps) + { + for(int i = 0; i < nvfx->framebuffer.nr_cbufs; ++i) + { + if(render_temps & (1 << i)) + util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.cbufs[i]), + (struct util_dirty_surface*)nvfx->framebuffer.cbufs[i]); + } + + if(render_temps & 0x80) + util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.zsbuf), + (struct util_dirty_surface*)nvfx->framebuffer.zsbuf); + } + return TRUE; } @@ -137,21 +163,6 @@ nvfx_state_emit(struct nvfx_context *nvfx) ; MARK_RING(chan, max_relocs * 2, max_relocs * 2); nvfx_state_relocate(nvfx); - - unsigned render_temps = nvfx->state.render_temps; - if(render_temps) - { - for(int i = 0; i < nvfx->framebuffer.nr_cbufs; ++i) - { - if(render_temps & (1 << i)) - util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.cbufs[i]), - (struct util_dirty_surface*)nvfx->framebuffer.cbufs[i]); - } - - if(render_temps & 0x80) - util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.zsbuf), - (struct util_dirty_surface*)nvfx->framebuffer.zsbuf); - } } void @@ -161,7 +172,11 @@ nvfx_state_relocate(struct nvfx_context *nvfx) nvfx_fragtex_relocate(nvfx); nvfx_fragprog_relocate(nvfx); if (nvfx->render_mode == HW) + { nvfx_vbo_relocate(nvfx); + if(nvfx->use_index_buffer) + nvfx_idxbuf_relocate(nvfx); + } } boolean diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c index 80b0f21575f..28bbd36c2e8 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_fb.c +++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c @@ -1,6 +1,5 @@ #include "nvfx_context.h" #include "nvfx_resource.h" -#include "nouveau/nouveau_util.h" #include "util/u_format.h" static inline boolean @@ -125,8 +124,8 @@ nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result) assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED | - (log2i(fb->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) | - (log2i(fb->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT); + (util_logbase2(fb->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) | + (util_logbase2(fb->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT); } else rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR; diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c index 7efdd954b4b..135978ad274 100644 --- a/src/gallium/drivers/nvfx/nvfx_surface.c +++ b/src/gallium/drivers/nvfx/nvfx_surface.c @@ -36,7 +36,6 @@ #include "util/u_blitter.h" #include "nouveau/nouveau_winsys.h" -#include "nouveau/nouveau_util.h" #include "nouveau/nouveau_screen.h" #include "nvfx_context.h" #include "nvfx_screen.h" @@ -62,7 +61,7 @@ nvfx_region_set_format(struct nv04_region* rgn, enum pipe_format format) break; default: assert(util_is_pot(bits)); - int shift = log2i(bits) - 3; + int shift = util_logbase2(bits) - 3; assert(shift >= 2); rgn->bpps = 2; shift -= 2; @@ -365,25 +364,29 @@ nvfx_surface_copy_temp(struct pipe_context* pipe, struct pipe_surface* surf, int { struct nvfx_surface* ns = (struct nvfx_surface*)surf; struct pipe_subresource tempsr, surfsr; - struct pipe_resource *idxbuf_buffer; - unsigned idxbuf_format; + struct nvfx_context* 
nvfx = nvfx_context(pipe); + + // TODO: we really should do this validation before setting these variable in draw calls + unsigned use_vertex_buffers = nvfx->use_vertex_buffers; + boolean use_index_buffer = nvfx->use_index_buffer; + unsigned base_vertex = nvfx->base_vertex; tempsr.face = 0; tempsr.level = 0; surfsr.face = surf->face; surfsr.level = surf->level; - // TODO: do this properly, in blitter save - idxbuf_buffer = ((struct nvfx_context*)pipe)->idxbuf_buffer; - idxbuf_format = ((struct nvfx_context*)pipe)->idxbuf_format; - if(to_temp) nvfx_resource_copy_region(pipe, &ns->temp->base.base, tempsr, 0, 0, 0, surf->texture, surfsr, 0, 0, surf->zslice, surf->width, surf->height); else nvfx_resource_copy_region(pipe, surf->texture, surfsr, 0, 0, surf->zslice, &ns->temp->base.base, tempsr, 0, 0, 0, surf->width, surf->height); - ((struct nvfx_context*)pipe)->idxbuf_buffer = idxbuf_buffer; - ((struct nvfx_context*)pipe)->idxbuf_format = idxbuf_format; + nvfx->use_vertex_buffers = use_vertex_buffers; + nvfx->use_index_buffer = use_index_buffer; + nvfx->base_vertex = base_vertex; + + nvfx->dirty |= NVFX_NEW_ARRAYS; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; } void diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.c b/src/gallium/drivers/nvfx/nvfx_transfer.c index e9c3dd7e551..ca4462ef9dc 100644 --- a/src/gallium/drivers/nvfx/nvfx_transfer.c +++ b/src/gallium/drivers/nvfx/nvfx_transfer.c @@ -26,25 +26,44 @@ nvfx_transfer_new(struct pipe_context *pipe, unsigned usage, const struct pipe_box *box) { - struct nvfx_staging_transfer* tx; - bool direct = !nvfx_resource_on_gpu(pt) && pt->flags & NVFX_RESOURCE_FLAG_LINEAR; - - tx = CALLOC_STRUCT(nvfx_staging_transfer); - if(!tx) - return NULL; - - util_staging_transfer_init(pipe, pt, sr, usage, box, direct, tx); + if((usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_DONTBLOCK)) == PIPE_TRANSFER_DONTBLOCK) + { + struct nouveau_bo* bo = ((struct nvfx_resource*)pt)->bo; + if(bo && nouveau_bo_busy(bo, NOUVEAU_BO_WR)) + return NULL; + } if(pt->target == PIPE_BUFFER) { - tx->base.base.slice_stride = tx->base.base.stride = ((struct nvfx_resource*)tx->base.staging_resource)->bo->size; - if(direct) - tx->offset = util_format_get_stride(pt->format, box->x); - else - tx->offset = 0; + // it would be nice if we could avoid all this ridiculous overhead... 
+ struct pipe_transfer* tx; + struct nvfx_buffer* buffer = nvfx_buffer(pt); + + tx = CALLOC_STRUCT(pipe_transfer); + if (!tx) + return NULL; + + pipe_resource_reference(&tx->resource, pt); + tx->sr = sr; + tx->usage = usage; + tx->box = *box; + + tx->slice_stride = tx->stride = util_format_get_stride(pt->format, box->width); + tx->data = buffer->data + util_format_get_stride(pt->format, box->x); + + return tx; } else { + struct nvfx_staging_transfer* tx; + bool direct = !nvfx_resource_on_gpu(pt) && pt->flags & NVFX_RESOURCE_FLAG_LINEAR; + + tx = CALLOC_STRUCT(nvfx_staging_transfer); + if(!tx) + return NULL; + + util_staging_transfer_init(pipe, pt, sr, usage, box, direct, &tx->base); + if(direct) { tx->base.base.stride = nvfx_subresource_pitch(pt, sr.level); @@ -66,26 +85,132 @@ nvfx_transfer_new(struct pipe_context *pipe, } } +static void nvfx_buffer_dirty_interval(struct nvfx_buffer* buffer, unsigned begin, unsigned size, boolean unsynchronized) +{ + struct nvfx_screen* screen = nvfx_screen(buffer->base.base.screen); + buffer->last_update_static = buffer->bytes_to_draw_until_static < 0; + if(buffer->dirty_begin == buffer->dirty_end) + { + buffer->dirty_begin = begin; + buffer->dirty_end = begin + size; + buffer->dirty_unsynchronized = unsynchronized; + } + else + { + buffer->dirty_begin = MIN2(buffer->dirty_begin, begin); + buffer->dirty_end = MAX2(buffer->dirty_end, begin + size); + buffer->dirty_unsynchronized &= unsynchronized; + } + + if(unsynchronized) + { + // TODO: revisit this, it doesn't seem quite right + //printf("UNSYNC UPDATE %p %u %u\n", buffer, begin, size); + buffer->bytes_to_draw_until_static += size * screen->static_reuse_threshold; + } + else + buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold; +} + +static void nvfx_transfer_flush_region( struct pipe_context *pipe, + struct pipe_transfer *ptx, + const struct pipe_box *box) +{ + if(ptx->resource->target == PIPE_BUFFER && (ptx->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) + { + struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource); + nvfx_buffer_dirty_interval(buffer, + (uint8_t*)ptx->data - buffer->data + util_format_get_stride(buffer->base.base.format, box->x), + util_format_get_stride(buffer->base.base.format, box->width), + !!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED)); + } +} + +static void +nvfx_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx) +{ + if(ptx->resource->target == PIPE_BUFFER) + { + struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource); + if((ptx->usage & (PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT)) == PIPE_TRANSFER_WRITE) + nvfx_buffer_dirty_interval(buffer, + (uint8_t*)ptx->data - buffer->data, + ptx->stride, + !!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED)); + pipe_resource_reference(&ptx->resource, 0); + FREE(ptx); + } + else + util_staging_transfer_destroy(pipe, ptx); +} + void * nvfx_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx) { - struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; - if(!ptx->data) + if(ptx->resource->target == PIPE_BUFFER) + return ptx->data; + else { - struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; - uint8_t *map = nouveau_screen_bo_map(pipe->screen, mt->base.bo, nouveau_screen_transfer_flags(ptx->usage)); - ptx->data = map + tx->offset; + struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; + if(!ptx->data) + { + struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; + uint8_t *map = 
nouveau_screen_bo_map(pipe->screen, mt->base.bo, nouveau_screen_transfer_flags(ptx->usage)); + ptx->data = map + tx->offset; + } + + ++tx->map_count; + return ptx->data; } - ++tx->map_count; - return ptx->data; } void nvfx_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx) { - struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; - struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; + if(ptx->resource->target != PIPE_BUFFER) + { + struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; + struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; + + if(!--tx->map_count) + { + nouveau_screen_bo_unmap(pipe->screen, mt->base.bo); + ptx->data = 0; + } + } +} + +static void nvfx_transfer_inline_write( struct pipe_context *pipe, + struct pipe_resource *pr, + struct pipe_subresource sr, + unsigned usage, + const struct pipe_box *box, + const void *data, + unsigned stride, + unsigned slice_stride) +{ + if(pr->target != PIPE_BUFFER) + { + u_default_transfer_inline_write(pipe, pr, sr, usage, box, data, stride, slice_stride); + } + else + { + struct nvfx_buffer* buffer = nvfx_buffer(pr); + unsigned begin = util_format_get_stride(pr->format, box->x); + unsigned size = util_format_get_stride(pr->format, box->width); + memcpy(buffer->data + begin, data, size); + nvfx_buffer_dirty_interval(buffer, begin, size, + !!(pr->flags & PIPE_TRANSFER_UNSYNCHRONIZED)); + } +} - if(!--tx->map_count) - nouveau_screen_bo_unmap(pipe->screen, mt->base.bo); +void +nvfx_init_transfer_functions(struct pipe_context *pipe) +{ + pipe->get_transfer = nvfx_transfer_new; + pipe->transfer_map = nvfx_transfer_map; + pipe->transfer_flush_region = nvfx_transfer_flush_region; + pipe->transfer_unmap = nvfx_transfer_unmap; + pipe->transfer_destroy = nvfx_transfer_destroy; + pipe->transfer_inline_write = nvfx_transfer_inline_write; } diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c index 4aa37938425..a6cd1256350 100644 --- a/src/gallium/drivers/nvfx/nvfx_vbo.c +++ b/src/gallium/drivers/nvfx/nvfx_vbo.c @@ -2,6 +2,7 @@ #include "pipe/p_state.h" #include "util/u_inlines.h" #include "util/u_format.h" +#include "translate/translate.h" #include "nvfx_context.h" #include "nvfx_state.h" @@ -10,646 +11,583 @@ #include "nouveau/nouveau_channel.h" #include "nouveau/nouveau_class.h" #include "nouveau/nouveau_pushbuf.h" -#include "nouveau/nouveau_util.h" -static INLINE int -nvfx_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +static inline unsigned +util_guess_unique_indices_count(unsigned mode, unsigned indices) { - switch (pipe) { - case PIPE_FORMAT_R32_FLOAT: - case PIPE_FORMAT_R32G32_FLOAT: - case PIPE_FORMAT_R32G32B32_FLOAT: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - *fmt = NV34TCL_VTXFMT_TYPE_FLOAT; - break; - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_R16G16B16_FLOAT: - case PIPE_FORMAT_R16G16B16A16_FLOAT: - *fmt = NV34TCL_VTXFMT_TYPE_HALF; - break; - case PIPE_FORMAT_R8_UNORM: - case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_R8G8B8A8_UNORM: - *fmt = NV34TCL_VTXFMT_TYPE_UBYTE; - break; - case PIPE_FORMAT_R16_SSCALED: - case PIPE_FORMAT_R16G16_SSCALED: - case PIPE_FORMAT_R16G16B16_SSCALED: - case PIPE_FORMAT_R16G16B16A16_SSCALED: - *fmt = NV34TCL_VTXFMT_TYPE_USHORT; - break; - default: - NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe)); - return 1; + /* Euler's formula gives V = + * = E - F + 2 = + * = F * 
(polygon_edges / 2 - 1) + 2 = + * = F * (polygon_edges - 2) / 2 + 2 = + * = indices * (polygon_edges - 2) / (2 * indices_per_face) + 2 + * = indices * (1 / 2 - 1 / polygon_edges) + 2 + */ + switch(mode) + { + case PIPE_PRIM_LINES: + return indices >> 1; + case PIPE_PRIM_TRIANGLES: + { + // avoid an expensive division by 3 using the multiplicative inverse mod 2^32 + unsigned q; + unsigned inv3 = 2863311531; + indices >>= 1; + q = indices * inv3; + if(unlikely(q >= indices)) + { + q += inv3; + if(q >= indices) + q += inv3; + } + return indices + 2; + //return indices / 6 + 2; } - - switch (pipe) { - case PIPE_FORMAT_R8_UNORM: - case PIPE_FORMAT_R32_FLOAT: - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_R16_SSCALED: - *ncomp = 1; - break; - case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R32G32_FLOAT: - case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_R16G16_SSCALED: - *ncomp = 2; - break; - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_R32G32B32_FLOAT: - case PIPE_FORMAT_R16G16B16_FLOAT: - case PIPE_FORMAT_R16G16B16_SSCALED: - *ncomp = 3; - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - case PIPE_FORMAT_R16G16B16A16_FLOAT: - case PIPE_FORMAT_R16G16B16A16_SSCALED: - *ncomp = 4; - break; + // guess that indexed quads are created by successive connections, since a closed mesh seems unlikely + case PIPE_PRIM_QUADS: + return (indices >> 1) + 2; + // return (indices >> 2) + 2; // if it is a closed mesh default: - NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe)); - return 1; + return indices; } - - return 0; } -static boolean -nvfx_vbo_set_idxbuf(struct nvfx_context *nvfx, struct pipe_resource *ib, - unsigned ib_size) +static unsigned nvfx_decide_upload_mode(struct pipe_context *pipe, const struct pipe_draw_info *info) { - unsigned type; - - if (!ib) { - nvfx->idxbuf_buffer = NULL; - nvfx->idxbuf_format = 0xdeadbeef; - return FALSE; + struct nvfx_context* nvfx = nvfx_context(pipe); + unsigned hardware_cost = 0; + unsigned inline_cost = 0; + unsigned unique_vertices; + unsigned upload_mode; + if (info->indexed) + unique_vertices = util_guess_unique_indices_count(info->mode, info->count); + else + unique_vertices = info->count; + + /* Here we try to figure out if we are better off writing vertex data directly on the FIFO, + * or create hardware buffer objects and pointing the hardware to them. + * + * This is done by computing the total memcpy cost of each option, ignoring uploads + * if we think that the buffer is static and thus the upload cost will be amortized over + * future draw calls. + * + * For instance, if everything looks static, we will always create buffer objects, while if + * everything is a user buffer and we are not doing indexed drawing, we never do. + * + * Other interesting cases are where a small user vertex buffer, but a huge user index buffer, + * where we will upload the vertex buffer, so that we can use hardware index lookup, and + * the opposite case, where we instead do index lookup in software to avoid uploading + * a huge amount of vertex data that is not going to be used. + * + * Otherwise, we generally move to the GPU the after it has been pushed + * NVFX_STATIC_BUFFER_MIN_REUSE_TIMES times to the GPU without having + * been updated with a transfer (or just the buffer having been destroyed). + * + * There is no special handling for user buffers, since applications can use + * OpenGL VBOs in a one-shot fashion. OpenGL 3/4 core profile forces this + * by the way. 
+ * + * Note that currently we don't support only putting some data on the FIFO, and + * some on vertex buffers (constant and instanced data is independent from this). + * + * nVidia doesn't seem to do this either, even though it should be at least + * doable with VTX_ATTR and possibly with VERTEX_DATA too if not indexed. + */ + + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + buffer->bytes_to_draw_until_static -= vbi->per_vertex_size * unique_vertices; + if (!nvfx_buffer_seems_static(buffer)) + { + hardware_cost += buffer->dirty_end - buffer->dirty_begin; + if (!buffer->base.bo) + hardware_cost += nvfx->screen->buffer_allocation_cost; + } + inline_cost += vbi->per_vertex_size * info->count; } - if (!nvfx->screen->index_buffer_reloc_flags || ib_size == 1) - return FALSE; + float best_index_cost_for_hardware_vertices_as_inline_cost = 0.0f; + boolean prefer_hardware_indices = FALSE; + unsigned index_inline_cost = 0; + unsigned index_hardware_cost = 0; - switch (ib_size) { - case 2: - type = NV34TCL_IDXBUF_FORMAT_TYPE_U16; - break; - case 4: - type = NV34TCL_IDXBUF_FORMAT_TYPE_U32; - break; - default: - return FALSE; - } + if (info->indexed) + { + index_inline_cost = nvfx->idxbuf.index_size * info->count; + if (nvfx->screen->index_buffer_reloc_flags + && (nvfx->idxbuf.index_size == 2 || nvfx->idxbuf.index_size == 4) + && !(nvfx->idxbuf.offset & (nvfx->idxbuf.index_size - 1))) + { + struct nvfx_buffer* buffer = nvfx_buffer(nvfx->idxbuf.buffer); + buffer->bytes_to_draw_until_static -= index_inline_cost; - if (ib != nvfx->idxbuf_buffer || - type != nvfx->idxbuf_format) { - nvfx->dirty |= NVFX_NEW_ARRAYS; - nvfx->idxbuf_buffer = ib; - nvfx->idxbuf_format = type; - } + prefer_hardware_indices = TRUE; - return TRUE; -} + if (!nvfx_buffer_seems_static(buffer)) + { + index_hardware_cost = buffer->dirty_end - buffer->dirty_begin; + if (!buffer->base.bo) + index_hardware_cost += nvfx->screen->buffer_allocation_cost; + } -// type must be floating point -static inline void -nvfx_vbo_static_attrib(struct nvfx_context *nvfx, - int attrib, struct pipe_vertex_element *ve, - struct pipe_vertex_buffer *vb, unsigned ncomp) -{ - struct pipe_transfer *transfer; - struct nouveau_channel* chan = nvfx->screen->base.channel; - void *map; - float *v; - - map = pipe_buffer_map(&nvfx->pipe, vb->buffer, PIPE_TRANSFER_READ, &transfer); - map = (uint8_t *) map + vb->buffer_offset + ve->src_offset; - - v = map; - - switch (ncomp) { - case 4: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - OUT_RING(chan, fui(v[2])); - OUT_RING(chan, fui(v[3])); - break; - case 3: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - OUT_RING(chan, fui(v[2])); - break; - case 2: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - break; - case 1: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1)); - OUT_RING(chan, fui(v[0])); - break; + if ((float) index_inline_cost < (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost) + { + best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_inline_cost; + } + else + { + 
best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost; + prefer_hardware_indices = TRUE; + } + } } - pipe_buffer_unmap(&nvfx->pipe, vb->buffer, transfer); + /* let's finally figure out which of the 3 paths we want to take */ + if ((float) (inline_cost + index_inline_cost) > ((float) hardware_cost * nvfx->screen->inline_cost_per_hardware_cost + best_index_cost_for_hardware_vertices_as_inline_cost)) + upload_mode = 1 + prefer_hardware_indices; + else + upload_mode = 0; + +#ifdef DEBUG + if (unlikely(nvfx->screen->trace_draw)) + { + fprintf(stderr, "DRAW"); + if (info->indexed) + { + fprintf(stderr, "_IDX%u", nvfx->idxbuf.index_size); + if (info->index_bias) + fprintf(stderr, " biased %u", info->index_bias); + fprintf(stderr, " idxrange %u -> %u", info->min_index, info->max_index); + } + if (info->instance_count > 1) + fprintf(stderr, " %u instances from %u", info->instance_count, info->indexed); + fprintf(stderr, " start %u count %u prim %u", info->start, info->count, info->mode); + if (!upload_mode) + fprintf(stderr, " -> inline vertex data"); + else if (upload_mode == 2 || !info->indexed) + fprintf(stderr, " -> buffer range"); + else + fprintf(stderr, " -> inline indices"); + fprintf(stderr, " [ivtx %u hvtx %u iidx %u hidx %u bidx %f] <", inline_cost, hardware_cost, index_inline_cost, index_hardware_cost, best_index_cost_for_hardware_vertices_as_inline_cost); + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + if (i) + fprintf(stderr, ", "); + fprintf(stderr, "%p%s left %Li", buffer, buffer->last_update_static ? 
" static" : "", buffer->bytes_to_draw_until_static); + } + fprintf(stderr, ">\n"); + } +#endif + + return upload_mode; } -static void -nvfx_draw_arrays(struct pipe_context *pipe, - unsigned mode, unsigned start, unsigned count) +void nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) { struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - unsigned restart = 0; - - nvfx_vbo_set_idxbuf(nvfx, NULL, 0); - if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) { - nvfx_draw_elements_swtnl(pipe, NULL, 0, 0, - mode, start, count); - return; - } + unsigned upload_mode = 0; - while (count) { - unsigned vc, nr, avail; + if (!nvfx->vtxelt->needs_translate) + upload_mode = nvfx_decide_upload_mode(pipe, info); - nvfx_state_emit(nvfx); + nvfx->use_index_buffer = upload_mode > 1; - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ + if ((upload_mode > 0) != nvfx->use_vertex_buffers) + { + nvfx->use_vertex_buffers = (upload_mode > 0); + nvfx->dirty |= NVFX_NEW_ARRAYS; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; + } - vc = nouveau_vbuf_split(avail, 6, 256, - mode, start, count, &restart); - if (!vc) { - FIRE_RING(chan); - continue; + if (upload_mode > 0) + { + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + nvfx_buffer_upload(nvfx_buffer(vb->buffer)); } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + if (upload_mode > 1) + { + nvfx_buffer_upload(nvfx_buffer(nvfx->idxbuf.buffer)); - nr = (vc & 0xff); - if (nr) { - OUT_RING(chan, RING_3D(NV34TCL_VB_VERTEX_BATCH, 1)); - OUT_RING (chan, ((nr - 1) << 24) | start); - start += nr; + if (unlikely(info->index_bias != nvfx->base_vertex)) + { + nvfx->base_vertex = info->index_bias; + nvfx->dirty |= NVFX_NEW_ARRAYS; + } } - - nr = vc >> 8; - while (nr) { - unsigned push = nr > 2047 ? 
2047 : nr; - - nr -= push; - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_VERTEX_BATCH, push)); - while (push--) { - OUT_RING(chan, ((0x100 - 1) << 24) | start); - start += 0x100; + else + { + if (unlikely(info->start < nvfx->base_vertex && nvfx->base_vertex)) + { + nvfx->base_vertex = 0; + nvfx->dirty |= NVFX_NEW_ARRAYS; } } - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); - - count -= vc; - start = restart; } - pipe->flush(pipe, 0, NULL); + if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) + nvfx_draw_vbo_swtnl(pipe, info); + else + nvfx_push_vbo(pipe, info); } -static INLINE void -nvfx_draw_elements_u08(struct nvfx_context *nvfx, void *ib, - unsigned mode, unsigned start, unsigned count) +boolean +nvfx_vbo_validate(struct nvfx_context *nvfx) { - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; + struct nouveau_channel* chan = nvfx->screen->base.channel; + int i; + int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr); + unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD; - while (count) { - uint8_t *elts = (uint8_t *)ib + start; - unsigned vc, push, restart = 0, avail; + if (!elements) + return TRUE; - nvfx_state_emit(nvfx); + MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2); + for(unsigned i = 0; i < nvfx->vtxelt->num_constant; ++i) + { + struct nvfx_low_frequency_element *ve = &nvfx->vtxelt->constant[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + float v[4]; + ve->fetch_rgba_float(v, buffer->data + vb->buffer_offset + ve->src_offset, 0, 0); + nvfx_emit_vtx_attr(chan, ve->idx, v, ve->ncomp); + } - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - vc = nouveau_vbuf_split(avail, 6, 2, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; + OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements)); + if(nvfx->use_vertex_buffers) + { + unsigned idx = 0; + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + if(idx != ve->idx) + { + assert(idx < ve->idx); + OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], ve->idx - idx); + idx = ve->idx; + } - if (vc & 1) { - OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); - OUT_RING (chan, elts[0]); - elts++; vc--; + OUT_RING(chan, nvfx->vtxelt->vtxfmt[idx] | (vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT)); + ++idx; } + if(idx != nvfx->vtxelt->num_elements) + OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], nvfx->vtxelt->num_elements - idx); + } + else + OUT_RINGp(chan, nvfx->vtxelt->vtxfmt, nvfx->vtxelt->num_elements); - while (vc) { - unsigned i; - - push = MIN2(vc, 2047 * 2); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); - for (i = 0; i < push; i+=2) - OUT_RING(chan, (elts[i+1] << 16) | elts[i]); + for(i = nvfx->vtxelt->num_elements; i < elements; ++i) + OUT_RING(chan, NV34TCL_VTXFMT_TYPE_32_FLOAT); - vc -= push; - elts += push; + if(nvfx->is_nv4x) { + unsigned i; + /* seems to be some kind of cache flushing */ + for(i = 0; i < 3; ++i) { + OUT_RING(chan, RING_3D(0x1718, 1)); + OUT_RING(chan, 0); } - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - 
OUT_RING (chan, 0); - - start = restart; } -} - -static INLINE void -nvfx_draw_elements_u16(struct nvfx_context *nvfx, void *ib, - unsigned mode, unsigned start, unsigned count) -{ - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - - while (count) { - uint16_t *elts = (uint16_t *)ib + start; - unsigned vc, push, restart = 0, avail; - nvfx_state_emit(nvfx); - - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - - vc = nouveau_vbuf_split(avail, 6, 2, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; + OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements)); + if(nvfx->use_vertex_buffers) + { + unsigned idx = 0; + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + for(; idx < ve->idx; ++idx) + OUT_RING(chan, 0); - if (vc & 1) { - OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); - OUT_RING (chan, elts[0]); - elts++; vc--; + OUT_RELOC(chan, bo, + vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV34TCL_VTXBUF_ADDRESS_DMA1); + ++idx; } - while (vc) { - unsigned i; - - push = MIN2(vc, 2047 * 2); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); - for (i = 0; i < push; i+=2) - OUT_RING(chan, (elts[i+1] << 16) | elts[i]); - - vc -= push; - elts += push; - } + for(; idx < elements; ++idx) + OUT_RING(chan, 0); + } + else + { + for (i = 0; i < elements; i++) + OUT_RING(chan, 0); + } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + OUT_RING(chan, RING_3D(0x1710, 1)); + OUT_RING(chan, 0); - start = restart; - } + nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements; + return TRUE; } -static INLINE void -nvfx_draw_elements_u32(struct nvfx_context *nvfx, void *ib, - unsigned mode, unsigned start, unsigned count) +void +nvfx_vbo_relocate(struct nvfx_context *nvfx) { - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - - while (count) { - uint32_t *elts = (uint32_t *)ib + start; - unsigned vc, push, restart = 0, avail; - - nvfx_state_emit(nvfx); - - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - - vc = nouveau_vbuf_split(avail, 5, 1, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); - - while (vc) { - push = MIN2(vc, 2047); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push)); - OUT_RINGp (chan, elts, push); - - vc -= push; - elts += push; - } + if(!nvfx->use_vertex_buffers) + return; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + struct nouveau_channel* chan = nvfx->screen->base.channel; + unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; + int i; - start = restart; + MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3); + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = 
&nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; + + OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(ve->idx), 1), + vb_flags, 0, 0); + OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV34TCL_VTXBUF_ADDRESS_DMA1); } } static void -nvfx_draw_elements_inline(struct pipe_context *pipe, - struct pipe_resource *ib, - unsigned ib_size, int ib_bias, - unsigned mode, unsigned start, unsigned count) +nvfx_idxbuf_emit(struct nvfx_context* nvfx, unsigned ib_flags) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct pipe_transfer *transfer; - void *map; - - map = pipe_buffer_map(pipe, ib, PIPE_TRANSFER_READ, &transfer); - if (!ib) { - NOUVEAU_ERR("failed mapping ib\n"); - return; - } + struct nouveau_channel* chan = nvfx->screen->base.channel; + unsigned ib_format = (nvfx->idxbuf.index_size == 2) ? NV34TCL_IDXBUF_FORMAT_TYPE_U16 : NV34TCL_IDXBUF_FORMAT_TYPE_U32; + struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf.buffer)->bo; + ib_flags |= nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD; - assert(ib_bias == 0); - - switch (ib_size) { - case 1: - nvfx_draw_elements_u08(nvfx, map, mode, start, count); - break; - case 2: - nvfx_draw_elements_u16(nvfx, map, mode, start, count); - break; - case 4: - nvfx_draw_elements_u32(nvfx, map, mode, start, count); - break; - default: - NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size); - break; - } + assert(nvfx->screen->index_buffer_reloc_flags); - pipe_buffer_unmap(pipe, ib, transfer); + MARK_RING(chan, 3, 3); + if(ib_flags & NOUVEAU_BO_DUMMY) + OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), ib_flags, 0, 0); + else + OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2)); + OUT_RELOC(chan, bo, nvfx->idxbuf.offset + 1, ib_flags | NOUVEAU_BO_LOW, 0, 0); + OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR, + 0, NV34TCL_IDXBUF_FORMAT_DMA1); } -static void -nvfx_draw_elements_vbo(struct pipe_context *pipe, - unsigned mode, unsigned start, unsigned count) +void +nvfx_idxbuf_validate(struct nvfx_context* nvfx) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - unsigned restart = 0; - - while (count) { - unsigned nr, vc, avail; - - nvfx_state_emit(nvfx); + nvfx_idxbuf_emit(nvfx, 0); +} - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ +void +nvfx_idxbuf_relocate(struct nvfx_context* nvfx) +{ + nvfx_idxbuf_emit(nvfx, NOUVEAU_BO_DUMMY); +} - vc = nouveau_vbuf_split(avail, 6, 256, - mode, start, count, &restart); - if (!vc) { - FIRE_RING(chan); - continue; - } +unsigned nvfx_vertex_formats[PIPE_FORMAT_COUNT] = +{ + [PIPE_FORMAT_R32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32B32A32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32B32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16B16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8_UNORM] = 
NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8A8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8A8_USCALED] = NV34TCL_VTXFMT_TYPE_8_USCALED, + [PIPE_FORMAT_R16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16B16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16B16A16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16B16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16B16A16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, +}; + +static void * +nvfx_vtxelts_state_create(struct pipe_context *pipe, + unsigned num_elements, + const struct pipe_vertex_element *elements) +{ + struct nvfx_context* nvfx = nvfx_context(pipe); + struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state); + struct translate_key transkey; + unsigned per_vertex_size[16]; + memset(per_vertex_size, 0, sizeof(per_vertex_size)); + + unsigned vb_compacted_index[16]; + + assert(num_elements < 16); /* not doing fallbacks yet */ + + memcpy(cso->pipe, elements, num_elements * sizeof(elements[0])); + cso->num_elements = num_elements; + cso->needs_translate = FALSE; + + transkey.nr_elements = 0; + transkey.output_stride = 0; + + for(unsigned i = 0; i < num_elements; ++i) + { + const struct pipe_vertex_element* ve = &elements[i]; + if(!ve->instance_divisor) + per_vertex_size[ve->vertex_buffer_index] += util_format_get_stride(ve->src_format, 1); + } + + for(unsigned i = 0; i < 16; ++i) + { + if(per_vertex_size[i]) + { + unsigned idx = cso->num_per_vertex_buffer_infos++; + cso->per_vertex_buffer_info[idx].vertex_buffer_index = i; + cso->per_vertex_buffer_info[idx].per_vertex_size = per_vertex_size[i]; + vb_compacted_index[i] = idx; + } + } + + for(unsigned i = 0; i < num_elements; ++i) + { + const struct pipe_vertex_element* ve = &elements[i]; + unsigned type = nvfx_vertex_formats[ve->src_format]; + unsigned ncomp = util_format_get_nr_components(ve->src_format); - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + //if(ve->frequency != PIPE_ELEMENT_FREQUENCY_PER_VERTEX) + if(ve->instance_divisor) + { + struct nvfx_low_frequency_element* lfve; + cso->vtxfmt[i] = NV34TCL_VTXFMT_TYPE_32_FLOAT; + + //if(ve->frequency == PIPE_ELEMENT_FREQUENCY_CONSTANT) + if(0) + lfve = &cso->constant[cso->num_constant++]; + else + { + lfve = &cso->per_instance[cso->num_per_instance++].base; + ((struct nvfx_per_instance_element*)lfve)->instance_divisor = ve->instance_divisor; + } - nr = (vc & 0xff); - if (nr) { - OUT_RING(chan, RING_3D(NV34TCL_VB_INDEX_BATCH, 1)); - OUT_RING (chan, ((nr - 1) << 24) | start); - start += nr; + lfve->idx = i; + lfve->vertex_buffer_index = ve->vertex_buffer_index; + lfve->src_offset = ve->src_offset; + lfve->fetch_rgba_float = util_format_description(ve->src_format)->fetch_rgba_float; + lfve->ncomp = ncomp; } - - nr = vc >> 8; - while (nr) { - unsigned push = nr > 2047 ? 
2047 : nr; - - nr -= push; - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_INDEX_BATCH, push)); - while (push--) { - OUT_RING(chan, ((0x100 - 1) << 24) | start); - start += 0x100; + else + { + unsigned idx; + + idx = cso->num_per_vertex++; + cso->per_vertex[idx].idx = i; + cso->per_vertex[idx].vertex_buffer_index = ve->vertex_buffer_index; + cso->per_vertex[idx].src_offset = ve->src_offset; + + idx = transkey.nr_elements++; + transkey.element[idx].input_format = ve->src_format; + transkey.element[idx].input_buffer = vb_compacted_index[ve->vertex_buffer_index]; + transkey.element[idx].input_offset = ve->src_offset; + transkey.element[idx].instance_divisor = 0; + transkey.element[idx].type = TRANSLATE_ELEMENT_NORMAL; + if(type) + { + transkey.element[idx].output_format = ve->src_format; + cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type; + } + else + { + unsigned float32[4] = {PIPE_FORMAT_R32_FLOAT, PIPE_FORMAT_R32G32_FLOAT, PIPE_FORMAT_R32G32B32_FLOAT, PIPE_FORMAT_R32G32B32A32_FLOAT}; + transkey.element[idx].output_format = float32[ncomp - 1]; + cso->needs_translate = TRUE; + cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | NV34TCL_VTXFMT_TYPE_32_FLOAT; } + transkey.element[idx].output_offset = transkey.output_stride; + transkey.output_stride += (util_format_get_stride(transkey.element[idx].output_format, 1) + 3) & ~3; } + } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + cso->translate = translate_generic_create(&transkey); + cso->vertex_length = transkey.output_stride >> 2; + cso->max_vertices_per_packet = 2047 / cso->vertex_length; - count -= vc; - start = restart; - } + return (void *)cso; } static void -nvfx_draw_elements(struct pipe_context *pipe, - struct pipe_resource *indexBuffer, - unsigned indexSize, int indexBias, - unsigned mode, unsigned start, unsigned count) +nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso) { - struct nvfx_context *nvfx = nvfx_context(pipe); - boolean idxbuf; - - idxbuf = nvfx_vbo_set_idxbuf(nvfx, indexBuffer, indexSize); - if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) { - nvfx_draw_elements_swtnl(pipe, - indexBuffer, indexSize, indexBias, - mode, start, count); - return; - } - - if (idxbuf) { - nvfx_draw_elements_vbo(pipe, mode, start, count); - } else { - nvfx_draw_elements_inline(pipe, - indexBuffer, indexSize, indexBias, - mode, start, count); - } - - pipe->flush(pipe, 0, NULL); + FREE(hwcso); } -void -nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +static void +nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso) { struct nvfx_context *nvfx = nvfx_context(pipe); - if (info->indexed && nvfx->idxbuf.buffer) { - unsigned offset; - - assert(nvfx->idxbuf.offset % nvfx->idxbuf.index_size == 0); - offset = nvfx->idxbuf.offset / nvfx->idxbuf.index_size; - - nvfx_draw_elements(pipe, - nvfx->idxbuf.buffer, - nvfx->idxbuf.index_size, - info->index_bias, - info->mode, - info->start + offset, - info->count); - } - else { - nvfx_draw_arrays(pipe, - info->mode, - info->start, - info->count); - } + nvfx->vtxelt = hwcso; + nvfx->use_vertex_buffers = -1; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; } -boolean -nvfx_vbo_validate(struct nvfx_context *nvfx) +static void +nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) { - struct nouveau_channel* chan = nvfx->screen->base.channel; - struct pipe_resource *ib = nvfx->idxbuf_buffer; - unsigned ib_format = nvfx->idxbuf_format; - int i; - int elements = 
MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr); - uint32_t vtxfmt[16]; - unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD; - - if (!elements) - return TRUE; - - nvfx->vbo_bo = 0; - - MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2); - for (i = 0; i < nvfx->vtxelt->num_elements; i++) { - struct pipe_vertex_element *ve; - struct pipe_vertex_buffer *vb; - unsigned type, ncomp; - - ve = &nvfx->vtxelt->pipe[i]; - vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - - if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { - MARK_UNDO(chan); - nvfx->fallback_swtnl |= NVFX_NEW_ARRAYS; - return FALSE; - } + struct nvfx_context *nvfx = nvfx_context(pipe); - if (!vb->stride && type == NV34TCL_VTXFMT_TYPE_FLOAT) { - nvfx_vbo_static_attrib(nvfx, i, ve, vb, ncomp); - vtxfmt[i] = type; - } else { - vtxfmt[i] = ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) | - (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type); - nvfx->vbo_bo |= (1 << i); - } + for(unsigned i = 0; i < count; ++i) + { + pipe_resource_reference(&nvfx->vtxbuf[i].buffer, vb[i].buffer); + nvfx->vtxbuf[i].buffer_offset = vb[i].buffer_offset; + nvfx->vtxbuf[i].max_index = vb[i].max_index; + nvfx->vtxbuf[i].stride = vb[i].stride; } - for(; i < elements; ++i) - vtxfmt[i] = NV34TCL_VTXFMT_TYPE_FLOAT; - - OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements)); - OUT_RINGp(chan, vtxfmt, elements); - - if(nvfx->is_nv4x) { - unsigned i; - /* seems to be some kind of cache flushing */ - for(i = 0; i < 3; ++i) { - OUT_RING(chan, RING_3D(0x1718, 1)); - OUT_RING(chan, 0); - } - } + for(unsigned i = count; i < nvfx->vtxbuf_nr; ++i) + pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0); - OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements)); - for (i = 0; i < nvfx->vtxelt->num_elements; i++) { - struct pipe_vertex_element *ve; - struct pipe_vertex_buffer *vb; + nvfx->vtxbuf_nr = count; + nvfx->use_vertex_buffers = -1; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; +} - ve = &nvfx->vtxelt->pipe[i]; - vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; +static void +nvfx_set_index_buffer(struct pipe_context *pipe, + const struct pipe_index_buffer *ib) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); - if (!(nvfx->vbo_bo & (1 << i))) - OUT_RING(chan, 0); - else - { - struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; - OUT_RELOC(chan, bo, - vb->buffer_offset + ve->src_offset, - vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, - 0, NV34TCL_VTXBUF_ADDRESS_DMA1); - } + if(ib) + { + pipe_resource_reference(&nvfx->idxbuf.buffer, ib->buffer); + nvfx->idxbuf.index_size = ib->index_size; + nvfx->idxbuf.offset = ib->offset; } - - for (; i < elements; i++) - OUT_RING(chan, 0); - - OUT_RING(chan, RING_3D(0x1710, 1)); - OUT_RING(chan, 0); - - if (ib) { - unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD; - struct nouveau_bo* bo = nvfx_resource(ib)->bo; - - assert(nvfx->screen->index_buffer_reloc_flags); - - OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2)); - OUT_RELOC(chan, bo, 0, ib_flags | NOUVEAU_BO_LOW, 0, 0); - OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR, - 0, NV34TCL_IDXBUF_FORMAT_DMA1); + else + { + pipe_resource_reference(&nvfx->idxbuf.buffer, 0); + nvfx->idxbuf.index_size = 0; + nvfx->idxbuf.offset = 0; } - nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements; - return TRUE; + nvfx->dirty |= NVFX_NEW_INDEX; + nvfx->draw_dirty |= NVFX_NEW_INDEX; } void -nvfx_vbo_relocate(struct nvfx_context *nvfx) +nvfx_init_vbo_functions(struct nvfx_context *nvfx) { - struct nouveau_channel* chan = nvfx->screen->base.channel; - 
unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; - int i; + nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers; + nvfx->pipe.set_index_buffer = nvfx_set_index_buffer; - MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3); - for(i = 0; i < nvfx->vtxelt->num_elements; ++i) { - if(nvfx->vbo_bo & (1 << i)) { - struct pipe_vertex_element *ve = &nvfx->vtxelt->pipe[i]; - struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; - OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(i), 1), - vb_flags, 0, 0); - OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset, - vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, - 0, NV34TCL_VTXBUF_ADDRESS_DMA1); - } - } - - if(nvfx->idxbuf_buffer) - { - unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; - struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf_buffer)->bo; - - assert(nvfx->screen->index_buffer_reloc_flags); - - OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), - ib_flags, 0, 0); - OUT_RELOC(chan, bo, 0, - ib_flags | NOUVEAU_BO_LOW, 0, 0); - OUT_RELOC(chan, bo, nvfx->idxbuf_format, - ib_flags | NOUVEAU_BO_OR, - 0, NV34TCL_IDXBUF_FORMAT_DMA1); - } + nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create; + nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete; + nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind; } diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c index 24d9846310e..939d2b83aee 100644 --- a/src/gallium/drivers/nvfx/nvfx_vertprog.c +++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c @@ -10,6 +10,7 @@ #include "nvfx_context.h" #include "nvfx_state.h" +#include "nvfx_resource.h" /* TODO (at least...): * 1. Indexed consts + ARL @@ -874,7 +875,6 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) struct nouveau_grobj *eng3d = screen->eng3d; struct nvfx_vertex_program *vp; struct pipe_resource *constbuf; - struct pipe_transfer *transfer = NULL; boolean upload_code = FALSE, upload_data = FALSE; int i; @@ -983,11 +983,8 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) if (vp->nr_consts) { float *map = NULL; - if (constbuf) { - map = pipe_buffer_map(pipe, constbuf, - PIPE_TRANSFER_READ, - &transfer); - } + if (constbuf) + map = nvfx_buffer(constbuf)->data; for (i = 0; i < vp->nr_consts; i++) { struct nvfx_vertex_program_data *vpd = &vp->consts[i]; @@ -1005,9 +1002,6 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) OUT_RING (chan, i + vp->data->start); OUT_RINGp (chan, (uint32_t *)vpd->value, 4); } - - if (constbuf) - pipe_buffer_unmap(pipe, constbuf, transfer); } /* Upload vtxprog */ -- 2.30.2
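
Note on the constant 2863311531 used in util_guess_unique_indices_count():
the unique-vertex estimate is derived from Euler's formula V = E - F + 2,
and the triangle case avoids a division by 3 by multiplying with the
multiplicative inverse of 3 modulo 2^32. The following standalone sketch
(illustrative only, not part of the patch; the helper name is made up)
shows why the trick recovers the exact quotient for any multiple of three:

#include <assert.h>
#include <stdint.h>

/* 3 * 0xAAAAAAAB == 0x200000001 == 1 (mod 2^32), so multiplying a
 * multiple of three by 0xAAAAAAAB (decimal 2863311531) yields the
 * exact quotient without emitting a division instruction. */
static uint32_t div3_exact(uint32_t n)
{
	return n * 0xAAAAAAABu;	/* valid only when n % 3 == 0 */
}

int main(void)
{
	for (uint32_t n = 0; n < 300000; n += 3)
		assert(div3_exact(n) == n / 3);
	return 0;
}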
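
For reference, the decision made by nvfx_decide_upload_mode() boils down to
a weighted byte-count comparison: the bytes that would be copied inline into
the FIFO versus the bytes that would have to be uploaded into GPU buffer
objects, the latter scaled by the screen's inline_cost_per_hardware_cost
factor (and skipped for buffers that look static, since their upload cost
amortizes over future draws). A rough standalone restatement follows; all
names are hypothetical and the per-buffer bookkeeping is omitted:

/* 0 = push vertices inline, 1 = hardware vertex buffers with inline
 * indices, 2 = hardware vertex and index buffers. */
static unsigned
choose_upload_mode(unsigned inline_cost, unsigned hardware_cost,
		   unsigned index_inline_cost, unsigned index_hardware_cost,
		   float inline_cost_per_hardware_cost, int hw_indices_usable)
{
	float hw_cost = (float)hardware_cost * inline_cost_per_hardware_cost;
	float index_cost = (float)index_inline_cost;
	int hw_indices = 0;

	/* pick the cheaper way to feed indices when hardware index
	 * buffers are usable at all */
	if (hw_indices_usable &&
	    (float)index_hardware_cost * inline_cost_per_hardware_cost < index_cost) {
		index_cost = (float)index_hardware_cost * inline_cost_per_hardware_cost;
		hw_indices = 1;
	}

	/* fall back to inline vertex data only when it is cheaper than
	 * the weighted hardware path */
	if ((float)(inline_cost + index_inline_cost) > hw_cost + index_cost)
		return hw_indices ? 2 : 1;
	return 0;
}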