From e7654b22aa02636d17a88a9a5ee1eeb213d81f30 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <darktama@iinet.net.au>
Date: Tue, 6 Feb 2007 00:39:50 +1100
Subject: [PATCH] nouveau: new bufferobj code.

The old code suffered from a number of issues, the most severe being that
with the Mesa VBO merge even swtcl used the driver's bufferobj interface.
For most VBO types (or on non-AGP cards) the buffer ended up in VRAM, which
severely hurt swtcl performance.  All bufferobjs now start in system memory
and stay there until they get referenced as a "real" VBO.

The other big change is that only potentially "damaged" areas are
uploaded/downloaded to/from the hardware.
---
 .../drivers/dri/nouveau/nouveau_bufferobj.c   | 662 +++++++++++++-----
 .../drivers/dri/nouveau/nouveau_bufferobj.h   |  64 +-
 src/mesa/drivers/dri/nouveau/nv30_fragprog.c  |   5 +-
 3 files changed, 562 insertions(+), 169 deletions(-)

diff --git a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
index d36196aeef2..684ed7b017d 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
@@ -8,29 +8,458 @@
 #include "nouveau_object.h"
 #include "nouveau_msg.h"
 
+#define NOUVEAU_MEM_FREE(mem) do {      \
+	nouveau_mem_free(ctx, (mem));   \
+	(mem) = NULL;                   \
+} while(0) /* uses the caller's local `ctx`; clears (mem) after freeing it */
+
 #define DEBUG(fmt,args...) do {                \
 	if (NOUVEAU_DEBUG & DEBUG_BUFFEROBJ) { \
 		fprintf(stderr, "%s: "fmt, __func__, ##args);  \
 	}                                      \
 } while(0)
 
-/* Wrapper for nouveau_mem_gpu_offset_get() that marks the bufferobj dirty
- * if the GPU modifies the data.
- */
+/* Copy `size` bytes at `offset` from the GPU buffer back to the CPU copy. */
+static GLboolean
+nouveau_bo_download_from_screen(GLcontext *ctx,	GLuint offset, GLuint size,
+						struct gl_buffer_object *bo)
+{
+	nouveauContextPtr nmesa = NOUVEAU_CONTEXT(ctx);
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+	nouveau_mem *in_mem;
+
+	DEBUG("bo=%p, offset=%d, size=%d\n", bo, offset, size);
+
+	/* If there's a permanent backing store, blit directly into it */
+	if (nbo->cpu_mem) {
+		if (nbo->cpu_mem != nbo->gpu_mem) {
+			DEBUG("..cpu_mem\n");
+			nouveau_memformat_flat_emit(ctx, nbo->cpu_mem, nbo->gpu_mem,
+						    offset, offset, size);
+		}
+	} else {
+		DEBUG("..sys_mem\n");
+		/* ->map is read below, so the bounce buffer has to be mapped */
+		in_mem = nouveau_mem_alloc(ctx, NOUVEAU_MEM_AGP |
+						NOUVEAU_MEM_MAPPED, size, 0);
+		if (in_mem) {
+			DEBUG("....via AGP\n");
+			/* blit to faster memory and copy from there */
+			nouveau_memformat_flat_emit(ctx, in_mem, nbo->gpu_mem,
+							 0, offset, size);
+			nouveau_notifier_wait_nop(ctx, nmesa->syncNotifier,
+						       NvSubMemFormat);
+			_mesa_memcpy(nbo->cpu_mem_sys + offset,
+					in_mem->map, size);
+			NOUVEAU_MEM_FREE(in_mem);
+		} else {
+			DEBUG("....direct VRAM copy\n");
+			/* worst case, copy directly from the VRAM mapping;
+			 * gpu_mem is only a descriptor, ->map is the data */
+			_mesa_memcpy(nbo->cpu_mem_sys + offset,
+				     nbo->gpu_mem->map + offset, size);
+		}
+	}
+
+	return GL_TRUE;
+}
+
+static GLboolean
+nouveau_bo_upload_to_screen(GLcontext *ctx, GLuint offset, GLuint size,
+					    struct gl_buffer_object *bo)
+{ /* Copies `size` bytes at `offset` of the CPU-side data into gpu_mem. */
+	nouveauContextPtr nmesa = NOUVEAU_CONTEXT(ctx);
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+	nouveau_mem *out_mem;
+
+	DEBUG("bo=%p, offset=%d, size=%d\n", bo, offset, size);
+
+	if (nbo->cpu_mem) {
+		if (nbo->cpu_mem != nbo->gpu_mem) { /* skipped when both are one buffer */
+			DEBUG("..cpu_mem\n");
+			nouveau_memformat_flat_emit(ctx, nbo->gpu_mem,
+						    nbo->cpu_mem,
+						    offset, offset, size);
+		}
+	} else { /* no cpu_mem: the data is read from cpu_mem_sys */
+		out_mem = nouveau_mem_alloc(ctx, NOUVEAU_MEM_AGP |
+						 NOUVEAU_MEM_MAPPED,
+						 size, 0);
+		if (out_mem) { /* stage the data through a temporary AGP buffer */
+			DEBUG("....via AGP\n");
+			_mesa_memcpy(out_mem->map,
+					nbo->cpu_mem_sys + offset, size);
+			nouveau_memformat_flat_emit(ctx, nbo->gpu_mem, out_mem,
+						    offset, 0, size);
+			nouveau_notifier_wait_nop(ctx, nmesa->syncNotifier,
+						       NvSubMemFormat); /* before out_mem is freed */
+			NOUVEAU_MEM_FREE(out_mem);
+		} else { /* allocation failed: write through the gpu_mem mapping */
+			DEBUG("....direct VRAM copy\n");
+			_mesa_memcpy(nbo->gpu_mem->map + offset,
+				     nbo->cpu_mem_sys + offset,
+				     size);
+		}
+	}
+
+	return GL_TRUE; /* no failure path is reported */
+}
+
+/* Give the buffer GPU storage and move its current contents onto the card. */
+GLboolean
+nouveau_bo_move_in(GLcontext *ctx, struct gl_buffer_object *bo)
+{
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+
+	DEBUG("bo=%p\n", bo);
+
+	if (bo->OnCard)
+		return GL_TRUE;
+	assert(nbo->gpu_mem_flags);
+
+	nbo->gpu_mem = nouveau_mem_alloc(ctx, nbo->gpu_mem_flags |
+					 NOUVEAU_MEM_MAPPED, bo->Size, 0);
+	assert(nbo->gpu_mem);
+
+	if (nbo->cpu_mem_flags) {
+		if ((nbo->cpu_mem_flags|NOUVEAU_MEM_MAPPED) != nbo->gpu_mem->type) {
+			DEBUG("..need cpu_mem buffer\n");
+			nbo->cpu_mem = nouveau_mem_alloc(ctx, nbo->cpu_mem_flags |
+							 NOUVEAU_MEM_MAPPED,
+							 bo->Size, 0);
+			if (nbo->cpu_mem)
+				DEBUG("....alloc ok, kill sys_mem buffer\n");
+		} else {
+			DEBUG("..cpu direct access to GPU buffer\n");
+			nbo->cpu_mem = nbo->gpu_mem;
+		}
+		/* Leave the malloc'd copy behind and clear the stale pointer.  The
+		 * direct case is copied here: the upload below skips aliases. */
+		if (nbo->cpu_mem) {
+			_mesa_memcpy(nbo->cpu_mem->map, nbo->cpu_mem_sys, bo->Size);
+			FREE(nbo->cpu_mem_sys);
+			nbo->cpu_mem_sys = NULL;
+		}
+	}
+	nouveau_bo_upload_to_screen(ctx, 0, bo->Size, bo);
+
+	bo->OnCard = GL_TRUE;
+	return GL_TRUE;
+}
+
+GLboolean
+nouveau_bo_move_out(GLcontext *ctx, struct gl_buffer_object *bo)
+{ /* Pulls the buffer back into malloc'd memory and frees its GPU storage. */
+	nouveauContextPtr nmesa = NOUVEAU_CONTEXT(ctx);
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+	GLuint nr_dirty;
+
+	DEBUG("bo=%p\n", bo);
+	if (!bo->OnCard) /* nothing on the card to evict */
+		return GL_TRUE;
+
+	nr_dirty = nouveau_bo_download_dirty(ctx, bo);
+	if (nbo->cpu_mem) {
+		if (nr_dirty && nbo->cpu_mem != nbo->gpu_mem) /* wait for queued downloads */
+			nouveau_notifier_wait_nop(ctx, nmesa->syncNotifier,
+						       NvSubMemFormat);
+		DEBUG("..destroy cpu_mem buffer\n");
+		nbo->cpu_mem_sys = malloc(bo->Size);
+		assert(nbo->cpu_mem_sys);
+		_mesa_memcpy(nbo->cpu_mem_sys, nbo->cpu_mem->map, bo->Size);
+		if (nbo->cpu_mem == nbo->gpu_mem) /* shared: freed once, via gpu_mem below */
+			nbo->cpu_mem = NULL;
+		else
+			NOUVEAU_MEM_FREE(nbo->cpu_mem);
+	}
+	NOUVEAU_MEM_FREE(nbo->gpu_mem);
+
+	bo->OnCard = GL_FALSE;
+	return GL_TRUE;
+}
+
+static void
+nouveau_bo_choose_storage_method(GLcontext *ctx, GLenum usage,
+						 struct gl_buffer_object *bo)
+{ /* Derives gpu_mem_flags/cpu_mem_flags for the buffer from the GL usage hint. */
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+	GLuint gpu_type = 0;
+	GLuint cpu_type = 0;
+
+	switch (usage) {
+	/* Client source, changes often, used by GL many times */
+	case GL_DYNAMIC_DRAW_ARB:
+		gpu_type = NOUVEAU_MEM_AGP | NOUVEAU_MEM_FB_ACCEPTABLE;
+		cpu_type = NOUVEAU_MEM_AGP; /* non-zero: move_in sets up cpu_mem */
+		break;
+	/* GL source, changes often, client reads many times */
+	case GL_DYNAMIC_READ_ARB:
+	/* Client source, specified once, used by GL many times */
+	case GL_STATIC_DRAW_ARB:
+	/* GL source, specified once, client reads many times */
+	case GL_STATIC_READ_ARB:
+	/* Client source, specified once, used by GL a few times */
+	case GL_STREAM_DRAW_ARB:
+	/* GL source, specified once, client reads a few times */
+	case GL_STREAM_READ_ARB:
+	/* GL source, changes often, used by GL many times*/
+	case GL_DYNAMIC_COPY_ARB:
+	/* GL source, specified once, used by GL many times */
+	case GL_STATIC_COPY_ARB:
+	/* GL source, specified once, used by GL a few times */
+	case GL_STREAM_COPY_ARB:
+		gpu_type = NOUVEAU_MEM_FB; /* cpu_type stays 0: no cpu_mem buffer */
+		break;
+	default: /* not one of the nine usage enums handled above */
+		assert(0);
+	}
+
+	nbo->gpu_mem_flags = gpu_type;
+	nbo->cpu_mem_flags = cpu_type;
+	nbo->usage	   = usage;
+}
+
+/* (Re)create storage for a buffer object and optionally fill it from data. */
+void
+nouveau_bo_init_storage(GLcontext *ctx,	GLuint valid_gpu_access,
+					GLsizeiptrARB size,
+					const GLvoid *data,
+					GLenum usage,
+					struct gl_buffer_object *bo)
+{
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+
+	DEBUG("bo=%p\n", bo);
+
+	/* Free up previous buffers if we can't reuse them.  bo->Size is still
+	 * the old size here, so this also catches a resized buffer that only
+	 * lives in cpu_mem_sys and has no gpu_mem to compare against. */
+	if (nbo->usage != usage || bo->Size != size ||
+			(nbo->gpu_mem && (nbo->gpu_mem->size != size))) {
+		if (nbo->cpu_mem_sys)
+			FREE(nbo->cpu_mem_sys);
+		if (nbo->cpu_mem && nbo->cpu_mem != nbo->gpu_mem)
+			NOUVEAU_MEM_FREE(nbo->cpu_mem);
+		nbo->cpu_mem = NULL;
+		if (nbo->gpu_mem)
+			NOUVEAU_MEM_FREE(nbo->gpu_mem);
+
+		bo->OnCard = GL_FALSE;
+		nbo->cpu_mem_sys = calloc(1, size);
+	}
+
+	nouveau_bo_choose_storage_method(ctx, usage, bo);
+	/* Force off flags that may not be ok for a given buffer */
+	nbo->gpu_mem_flags &= valid_gpu_access;
+
+	bo->Usage  = usage;
+	bo->Size   = size;
+
+	if (data) {
+		GLvoid *map = nouveau_bo_map(ctx, GL_WRITE_ONLY_ARB, bo);
+		_mesa_memcpy(map, data, size);
+		nouveau_bo_dirty_all(ctx, GL_FALSE, bo);
+		nouveau_bo_unmap(ctx, bo);
+	}
+}
+
+void *
+nouveau_bo_map(GLcontext *ctx, GLenum access, struct gl_buffer_object *bo)
+{ /* Returns a CPU pointer to the buffer data; `access` is a GL_*_ARB map mode. */
+	nouveauContextPtr nmesa = NOUVEAU_CONTEXT(ctx);
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+
+	DEBUG("bo=%p, access=%s\n", bo, _mesa_lookup_enum_by_nr(access));
+
+	if (bo->OnCard && 
+		(access == GL_READ_ONLY_ARB || access == GL_READ_WRITE_ARB)) {
+		GLuint nr_dirty; /* count returned by nouveau_bo_download_dirty() */
+
+		DEBUG("..on card\n");
+		nr_dirty = nouveau_bo_download_dirty(ctx, bo);
+
+		/* nouveau_bo_download_dirty won't wait unless it needs to
+		 * free a temp buffer, which isn't the case if cpu_mem is
+		 * present.
+		 */
+		if (nr_dirty && nbo->cpu_mem && nbo->cpu_mem != nbo->gpu_mem)
+			nouveau_notifier_wait_nop(ctx, nmesa->syncNotifier,
+						       NvSubMemFormat);
+	}
+
+	if (nbo->cpu_mem) { /* prefer the cpu_mem mapping when one exists */
+		DEBUG("..access via cpu_mem\n");
+		return nbo->cpu_mem->map;
+	} else { /* otherwise hand out the cpu_mem_sys pointer */
+		DEBUG("..access via cpu_mem_sys\n");
+		return nbo->cpu_mem_sys;
+	}
+}
+
+void
+nouveau_bo_unmap(GLcontext *ctx, struct gl_buffer_object *bo)
+{ /* Counterpart of nouveau_bo_map(); it currently only logs the call. */
+	DEBUG("unmap bo=%p\n", bo);
+}
+
 uint32_t
-nouveau_bufferobj_gpu_ref(GLcontext *ctx, GLenum access,
-			  struct gl_buffer_object *obj)
+nouveau_bo_gpu_ref(GLcontext *ctx, struct gl_buffer_object *bo)
 {
-	nouveau_buffer_object *nbo = (nouveau_buffer_object *)obj;
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
 
-	DEBUG("obj=%p, access=%s\n", obj, _mesa_lookup_enum_by_nr(access));
+	assert(nbo->mapped == GL_FALSE);
 
-	if (access == GL_WRITE_ONLY_ARB || access == GL_READ_WRITE_ARB)
-		nbo->gpu_dirty = GL_TRUE;
+	DEBUG("gpu_ref\n");
+	
+	if (!bo->OnCard) {
+		nouveau_bo_move_in(ctx, bo);
+		bo->OnCard = GL_TRUE;
+	}
+	nouveau_bo_upload_dirty(ctx, bo);
 
 	return nouveau_mem_gpu_offset_get(ctx, nbo->gpu_mem);
 }
 
+/* Record [offset, offset+size) as modified on the CPU or the GPU side. */
+void
+nouveau_bo_dirty_linear(GLcontext *ctx, GLboolean on_card,
+			uint32_t offset, uint32_t size,
+			struct gl_buffer_object *bo)
+{
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+	nouveau_bufferobj_dirty *dirty;
+	uint32_t start = offset;
+	uint32_t end = offset + size;
+	int i;
+
+	/* One shared buffer (or none at all): nothing to keep in sync */
+	if (nbo->cpu_mem == nbo->gpu_mem)
+		return;
+
+	dirty = on_card ? &nbo->gpu_dirty : &nbo->cpu_dirty;
+
+	DEBUG("on_card=%d, offset=%d, size=%d, bo=%p\n",
+			on_card, offset, size, bo);
+
+	for (i=0; i<dirty->nr_dirty; i++) {
+		nouveau_bufferobj_region *r = &dirty->dirty[i];
+
+		/* already dirty */
+		if (start >= r->start && end <= r->end) {
+			DEBUG("..already dirty\n");
+			return;
+		}
+
+		/* add to the end of a region (end > r->end, or we'd have returned) */
+		if (start >= r->start && start <= r->end) {
+			DEBUG("..extend end of region\n");
+			r->end = end;
+			return;
+		}
+
+		/* add to the start of a region that the new range reaches */
+		if (start < r->start && end >= r->start) {
+			DEBUG("..extend start of region\n");
+			r->start = start;
+			/* .. and to the end */
+			if (end > r->end) {
+				DEBUG("....and end\n");
+				r->end = end;
+			}
+			return;
+		}
+	}
+
+	/* new region; running out of memory is fatal, as for move_out's malloc */
+	DEBUG("..new dirty\n");
+	dirty->dirty = realloc(dirty->dirty, sizeof(nouveau_bufferobj_region) *
+					     (dirty->nr_dirty + 1));
+	assert(dirty->dirty);
+	dirty->dirty[dirty->nr_dirty].start = start;
+	dirty->dirty[dirty->nr_dirty].end   = end;
+	dirty->nr_dirty++;
+}
+
+void
+nouveau_bo_dirty_all(GLcontext *ctx, GLboolean on_card,
+		     struct gl_buffer_object *bo)
+{ /* Discards the chosen dirty list, then marks the whole buffer dirty. */
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+	nouveau_bufferobj_dirty *dirty;
+
+	dirty = on_card ? &nbo->gpu_dirty : &nbo->cpu_dirty; /* GPU-side or CPU-side list */
+	
+	DEBUG("dirty all\n");
+	if (dirty->nr_dirty) { /* drop the finer-grained regions first */
+		FREE(dirty->dirty);
+		dirty->dirty    = NULL;
+		dirty->nr_dirty = 0;
+	}
+
+	nouveau_bo_dirty_linear(ctx, on_card, 0, bo->Size, bo);
+}
+
+GLuint
+nouveau_bo_upload_dirty(GLcontext *ctx, struct gl_buffer_object *bo)
+{ /* Uploads each CPU-dirtied region, empties the list, returns how many there were. */
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+	nouveau_bufferobj_dirty *dirty = &nbo->cpu_dirty;
+	GLuint nr_dirty;
+	int i;
+
+	nr_dirty = dirty->nr_dirty;
+	if (!nr_dirty) { /* nothing recorded: return 0 without touching the list */
+		DEBUG("clean\n");
+		return nr_dirty;
+	}
+
+	for (i=0; i<nr_dirty; i++) {
+		nouveau_bufferobj_region *r = &dirty->dirty[i];
+
+		DEBUG("dirty %d: o=0x%08x, s=0x%08x\n",
+				i, r->start, r->end - r->start);
+		nouveau_bo_upload_to_screen(ctx,
+					    r->start, r->end - r->start, bo);
+	}
+
+	FREE(dirty->dirty);
+	dirty->dirty    = NULL;
+	dirty->nr_dirty = 0;
+
+	return nr_dirty; /* the count from before the list was cleared */
+}
+
+/* Pull every GPU-dirtied region back to the CPU copy; returns the count. */
+GLuint
+nouveau_bo_download_dirty(GLcontext *ctx, struct gl_buffer_object *bo)
+{
+	nouveau_buffer_object *nbo = (nouveau_buffer_object *)bo;
+	nouveau_bufferobj_dirty *dirty = &nbo->gpu_dirty;
+	GLuint nr_dirty;
+	int i;
+
+	nr_dirty = dirty->nr_dirty;
+	if (!nr_dirty) { /* bail out only when the GPU touched nothing */
+		DEBUG("clean\n");
+		return nr_dirty;
+	}
+
+	for (i=0; i<nr_dirty; i++) {
+		nouveau_bufferobj_region *r = &dirty->dirty[i];
+
+		DEBUG("dirty %d: o=0x%08x, s=0x%08x\n",
+				i, r->start, r->end - r->start);
+		nouveau_bo_download_from_screen(ctx, r->start,
+						r->end - r->start, bo);
+	}
+
+	FREE(dirty->dirty);
+	dirty->dirty    = NULL;
+	dirty->nr_dirty = 0;
+
+	return nr_dirty;
+}
+
 static void
 nouveauBindBuffer(GLcontext *ctx, GLenum target, struct gl_buffer_object *obj)
 {
@@ -42,10 +471,11 @@ nouveauNewBufferObject(GLcontext *ctx, GLuint buffer, GLenum target)
 	nouveau_buffer_object *nbo;
 
 	nbo = CALLOC_STRUCT(nouveau_buffer_object_t);
-	DEBUG("name=0x%08x, target=%s, obj=%p\n",
-			buffer, _mesa_lookup_enum_by_nr(target), nbo);
-	_mesa_initialize_buffer_object(&nbo->mesa, buffer, target);
-	return &nbo->mesa;
+	if (nbo)
+		_mesa_initialize_buffer_object(&nbo->mesa, buffer, target);
+	DEBUG("bo=%p\n", nbo);
+
+	return nbo ? &nbo->mesa : NULL;
 }
 
 static void
@@ -53,11 +483,13 @@ nouveauDeleteBuffer(GLcontext *ctx, struct gl_buffer_object *obj)
 {
 	nouveau_buffer_object *nbo = (nouveau_buffer_object *)obj;
 
-	DEBUG("obj=%p\n", obj);
+	if (nbo->gpu_dirty.nr_dirty) FREE(nbo->gpu_dirty.dirty);
+	if (nbo->cpu_dirty.nr_dirty) FREE(nbo->cpu_dirty.dirty);
+	/* cpu_mem can alias gpu_mem (direct access); free that only once */
+	if (nbo->cpu_mem && nbo->cpu_mem != nbo->gpu_mem)
+		nouveau_mem_free(ctx, nbo->cpu_mem);
+	if (nbo->gpu_mem) nouveau_mem_free(ctx, nbo->gpu_mem);
 
-	if (nbo->gpu_mem) {
-		nouveau_mem_free(ctx, nbo->gpu_mem);
-	}
 	_mesa_delete_buffer_object(ctx, obj);
 }
 
@@ -66,193 +498,105 @@ nouveauBufferData(GLcontext *ctx, GLenum target, GLsizeiptrARB size,
 		  const GLvoid *data, GLenum usage,
 		  struct gl_buffer_object *obj)
 {
-	nouveau_buffer_object *nbo = (nouveau_buffer_object *)obj;
+	GLuint gpu_flags;
 
-	DEBUG("obj=%p, target=%s, usage=%s, size=%d, data=%p\n",
-			obj,
+	DEBUG("target=%s, size=%d, data=%p, usage=%s, obj=%p\n",
 			_mesa_lookup_enum_by_nr(target),
+			(GLuint)size, data,
 			_mesa_lookup_enum_by_nr(usage),
-			(unsigned int)size,
-			data);
-
-	if (nbo->gpu_mem && nbo->gpu_mem->size != size)
-		nouveau_mem_free(ctx, nbo->gpu_mem);
-
-	/* Always have the GPU access the data from VRAM if possible.  For
-	 * some "usage" values it may be better from AGP be default?
-	 *
-	 * TODO: At some point we should drop the NOUVEAU_MEM_MAPPED flag.
-	 * TODO: Use the NOUVEAU_MEM_AGP_ACCEPTABLE flag.
-	 * TODO: What about PCI-E and shared system memory?
-	 */
-	if (!nbo->gpu_mem)
-		nbo->gpu_mem = nouveau_mem_alloc(ctx,
-						 NOUVEAU_MEM_FB |
-						 NOUVEAU_MEM_MAPPED,
-						 size,
-						 0);
-
-	if (!nbo->gpu_mem) {
-		MESSAGE("AIII bufferobj malloc failed\n");
-		return;
+			obj);
+
+	switch (target) {
+	case GL_ELEMENT_ARRAY_BUFFER_ARB:
+		gpu_flags = 0;
+		break;
+	default:
+		gpu_flags = NOUVEAU_BO_VRAM_OK | NOUVEAU_BO_AGP_OK;
+		break;
 	}
-
-	obj->Usage = usage;
-	obj->Size  = size;
-	if (!data)
-		return;
-
-	ctx->Driver.MapBuffer(ctx, target, GL_WRITE_ONLY_ARB, obj);
-	_mesa_memcpy(nbo->cpu_mem->map, data, size);
-	ctx->Driver.UnmapBuffer(ctx, target, obj);
+	nouveau_bo_init_storage(ctx, gpu_flags, size, data, usage, obj);
 }
 
-/*TODO: we don't need to DMA the entire buffer like MapBuffer does.. */
 static void
 nouveauBufferSubData(GLcontext *ctx, GLenum target, GLintptrARB offset,
 		     GLsizeiptrARB size, const GLvoid *data,
 		     struct gl_buffer_object *obj)
 {
-	DEBUG("obj=%p, target=%s, offset=0x%x, size=%d, data=%p\n",
-			obj,
+	GLvoid *out;
+
+	DEBUG("target=%s, offset=0x%x, size=%d, data=%p, obj=%p\n",
 			_mesa_lookup_enum_by_nr(target),
-			(unsigned int)offset,
-			(unsigned int)size,
-			data);
+			(GLuint)offset, (GLuint)size, data, obj);
 
-	ctx->Driver.MapBuffer(ctx, target, GL_WRITE_ONLY_ARB, obj);
-	_mesa_memcpy((GLubyte *)obj->Pointer + offset, data, size);
-	ctx->Driver.UnmapBuffer(ctx, target, obj);
+	out = nouveau_bo_map(ctx, GL_WRITE_ONLY_ARB, obj);
+	_mesa_memcpy(out + offset, data, size);
+	nouveau_bo_dirty_linear(ctx, GL_FALSE, offset, size, obj);
+	nouveau_bo_unmap(ctx, obj);
 }
 
-/*TODO: we don't need to DMA the entire buffer like MapBuffer does.. */
 static void
 nouveauGetBufferSubData(GLcontext *ctx, GLenum target, GLintptrARB offset,
 		     GLsizeiptrARB size, GLvoid *data,
 		     struct gl_buffer_object *obj)
 {
-	DEBUG("obj=%p, target=%s, offset=0x%x, size=%d, data=%p\n",
-			obj,
+	const GLvoid *in;
+
+	DEBUG("target=%s, offset=0x%x, size=%d, data=%p, obj=%p\n",
 			_mesa_lookup_enum_by_nr(target),
-			(unsigned int)offset,
-			(unsigned int)size,
-			data);
+			(GLuint)offset, (GLuint)size, data, obj);
 
-	ctx->Driver.MapBuffer(ctx, target, GL_READ_ONLY_ARB, obj);
-	_mesa_memcpy(data, (GLubyte *)obj->Pointer + offset, size);
-	ctx->Driver.UnmapBuffer(ctx, target, obj);
+	in = nouveau_bo_map(ctx, GL_READ_ONLY_ARB, obj);
+	_mesa_memcpy(data, in + offset, size);
+	nouveau_bo_unmap(ctx, obj);
 }
 
 static void *
 nouveauMapBuffer(GLcontext *ctx, GLenum target, GLenum access,
 		 struct gl_buffer_object *obj)
 {
-	nouveauContextPtr nmesa = NOUVEAU_CONTEXT(ctx);
-	nouveau_buffer_object *nbo = (nouveau_buffer_object *)obj;
-
-	DEBUG("obj=%p, target=%s, access=%s\n",
-			obj,
+	DEBUG("target=%s, access=%s, obj=%p\n",
 			_mesa_lookup_enum_by_nr(target),
-			_mesa_lookup_enum_by_nr(access));
+			_mesa_lookup_enum_by_nr(access),
+			obj
+			);
 
-	if (obj->Pointer) {
-		DEBUG("already mapped, return NULL\n");
+	/* Already mapped.. */
+	if (obj->Pointer)
 		return NULL;
-	}
 
-#ifdef ALLOW_MULTI_SUBCHANNEL
-	/* If GPU is accessing the data from VRAM, copy to faster AGP memory
-	 * before CPU access to the buffer.
+	/* Have to pass READ_WRITE here, nouveau_bo_map will only ensure that
+	 * the cpu_mem buffer is up-to-date if we ask for read access.
+	 *
+	 * However, even if the client only asks for write access, we're still
+	 * forced to reupload the entire buffer.  So, we need the cpu_mem buffer
+	 * to have the correct data all the time.
 	 */
-	if (nbo->gpu_mem->type & NOUVEAU_MEM_FB) {
-		DEBUG("Data in VRAM, copying to AGP for CPU access\n");
-
-		/* This can happen if BufferData grows the GPU-access buffer */
-		if (nbo->cpu_mem && nbo->cpu_mem->size != nbo->gpu_mem->size) {
-			nouveau_mem_free(ctx, nbo->cpu_mem);
-			nbo->cpu_mem = NULL;
-		}
-
-		if (!nbo->cpu_mem) {
-			nbo->cpu_mem = nouveau_mem_alloc(ctx,
-							 NOUVEAU_MEM_AGP |
-							 NOUVEAU_MEM_MAPPED,
-							 nbo->gpu_mem->size,
-							 0);
+	obj->Pointer = nouveau_bo_map(ctx, GL_READ_WRITE_ARB, obj);
 
-			/* Mark GPU data as modified, so it gets copied to
-			 * the new buffer */
-			nbo->gpu_dirty = GL_TRUE;
-		}
-
-		if (nbo->cpu_mem && nbo->gpu_dirty) {
-			nouveau_memformat_flat_emit(ctx, nbo->cpu_mem,
-							 nbo->gpu_mem,
-							 0, 0,
-							 nbo->gpu_mem->size);
-
-			nouveau_notifier_wait_nop(ctx,
-						  nmesa->syncNotifier,
-						  NvSubMemFormat);
-			nbo->gpu_dirty = GL_FALSE;
-		}
-
-		/* buffer isn't guaranteed to be up-to-date on the card now */
-		nbo->cpu_dirty = GL_TRUE;
-	}
-#endif
-
-	/* If the copy to AGP failed for some reason, just return a pointer
-	 * directly to vram..
+	/* The GL spec says that a client attempting to write to a bufferobj
+	 * mapped READ_ONLY object may have unpredictable results, possibly
+	 * even program termination.
+	 *
+	 * We're going to use this, and only mark the buffer as dirtied if
+	 * the client asks for write access.
 	 */
-	if (!nbo->cpu_mem) {
-		DEBUG("Returning direct pointer to VRAM\n");
-		nbo->cpu_mem   = nbo->gpu_mem;
-		nbo->cpu_dirty = GL_FALSE;
+	if (access != GL_READ_ONLY_ARB) {
+		/* We have no way of knowing what was modified by the client,
+		 * so the entire buffer gets dirtied. */
+		nouveau_bo_dirty_all(ctx, GL_FALSE, obj);
 	}
 
-	obj->Pointer = nbo->cpu_mem->map;
 	return obj->Pointer;
 }
 
 static GLboolean
 nouveauUnmapBuffer(GLcontext *ctx, GLenum target, struct gl_buffer_object *obj)
 {
-	nouveauContextPtr nmesa = NOUVEAU_CONTEXT(ctx);
-	nouveau_buffer_object *nbo = (nouveau_buffer_object *)obj;
-
-	DEBUG("obj=%p, target=%s\n", obj, _mesa_lookup_enum_by_nr(target));
+	DEBUG("target=%s, obj=%p\n", _mesa_lookup_enum_by_nr(target), obj);
 
-#ifdef ALLOW_MULTI_SUBCHANNEL
-	if (nbo->cpu_dirty && nbo->cpu_mem != nbo->gpu_mem) {
-		DEBUG("Copying potentially modified data back to GPU\n");
-
-		/* blit from GPU buffer -> CPU  buffer */
-		nouveau_memformat_flat_emit(ctx, nbo->gpu_mem, nbo->cpu_mem,
-		      			    0, 0, nbo->cpu_mem->size);
-
-		/* buffer is now up-to-date on the hardware (or rather, will
-		 * be by the time any other commands in this channel reference
-		 * the data.)
-		 */
-		nbo->cpu_dirty = GL_FALSE;
-
-		/* we can avoid this wait in some cases.. */
-		nouveau_notifier_wait_nop(ctx,
-					  nmesa->syncNotifier,
-					  NvSubMemFormat);
-
-		/* If it's likely CPU access to the buffer will occur often,
-		 * keep the cpu_mem around to avoid repeated allocs.
-		 */
-		if (obj->Usage != GL_DYNAMIC_DRAW_ARB) {
-
-			nouveau_mem_free(ctx, nbo->cpu_mem);
-			nbo->cpu_mem = NULL;
-		}
-	}
-#endif
+	assert(obj->Pointer);
 
+	nouveau_bo_unmap(ctx, obj);
 	obj->Pointer = NULL;
 	return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.h b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.h
index fccc349b836..932450fd877 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.h
@@ -4,24 +4,74 @@
 #include "mtypes.h"
 #include "nouveau_buffers.h"
 
+#define NOUVEAU_BO_VRAM_OK (NOUVEAU_MEM_FB | NOUVEAU_MEM_FB_ACCEPTABLE)
+#define NOUVEAU_BO_AGP_OK  (NOUVEAU_MEM_AGP | NOUVEAU_MEM_AGP_ACCEPTABLE)
+
+typedef struct nouveau_bufferobj_region_t {
+	uint32_t start;	/* byte offset where the dirty range begins */
+	uint32_t end;	/* offset + size, i.e. one past the last dirty byte */
+} nouveau_bufferobj_region;
+
+typedef struct nouveau_bufferobj_dirty_t {
+	nouveau_bufferobj_region *dirty;	/* realloc'd array of regions; NULL once cleared */
+	int nr_dirty;				/* number of entries in dirty[] */
+} nouveau_bufferobj_dirty;
+
 typedef struct nouveau_buffer_object_t {
 	/* Base class, must be first */
 	struct gl_buffer_object mesa;
 
+	GLboolean		mapped;
+	GLenum			usage;
+
 	/* Memory used for GPU access to the buffer*/
+	GLuint			gpu_mem_flags;
 	nouveau_mem *		gpu_mem;
-	/* Buffer has been dirtied by the GPU */
-	GLboolean		gpu_dirty;
+	nouveau_bufferobj_dirty	gpu_dirty;
 
 	/* Memory used for CPU access to the buffer */
+	GLuint			cpu_mem_flags;
 	nouveau_mem *		cpu_mem;
-	/* Buffer has possibly been dirtied by the CPU */
-	GLboolean		cpu_dirty;
+	GLvoid *		cpu_mem_sys;
+	nouveau_bufferobj_dirty	cpu_dirty;
 } nouveau_buffer_object;
 
-extern uint32_t nouveau_bufferobj_gpu_ref(GLcontext *ctx, GLenum access,
-      					  struct gl_buffer_object *obj);
+extern void
+nouveau_bo_init_storage(GLcontext *ctx, GLuint valid_gpu_access,
+			GLsizeiptrARB size, const GLvoid *data, GLenum usage,
+			struct gl_buffer_object *bo);
+
+extern GLboolean
+nouveau_bo_move_in(GLcontext *ctx, struct gl_buffer_object *bo);
+
+extern GLboolean
+nouveau_bo_move_out(GLcontext *ctx, struct gl_buffer_object *bo);
+
+extern void *
+nouveau_bo_map(GLcontext *ctx, GLenum usage, struct gl_buffer_object *bo);
+
+extern void
+nouveau_bo_unmap(GLcontext *ctx, struct gl_buffer_object *bo);
+
+extern uint32_t
+nouveau_bo_gpu_ref(GLcontext *ctx, struct gl_buffer_object *bo);
+
+extern void
+nouveau_bo_dirty_linear(GLcontext *ctx, GLboolean on_card,
+			uint32_t offset, uint32_t size,
+			struct gl_buffer_object *bo);
+
+extern void
+nouveau_bo_dirty_all(GLcontext *ctx, GLboolean on_card,
+		     struct gl_buffer_object *bo);
+
+extern GLuint
+nouveau_bo_upload_dirty(GLcontext *ctx, struct gl_buffer_object *bo);
+
+extern GLuint
+nouveau_bo_download_dirty(GLcontext *ctx, struct gl_buffer_object *bo);
 
-extern void nouveauInitBufferObjects(GLcontext *ctx);
+extern void
+nouveauInitBufferObjects(GLcontext *ctx);
 
 #endif
diff --git a/src/mesa/drivers/dri/nouveau/nv30_fragprog.c b/src/mesa/drivers/dri/nouveau/nv30_fragprog.c
index 02bd8014cc1..f868ec92931 100644
--- a/src/mesa/drivers/dri/nouveau/nv30_fragprog.c
+++ b/src/mesa/drivers/dri/nouveau/nv30_fragprog.c
@@ -32,14 +32,13 @@ NV30FPUploadToHW(GLcontext *ctx, nouveauShader *nvs)
 							GL_ARRAY_BUFFER_ARB);
 
    /* Should use STATIC_DRAW_ARB if shader doesn't use changable params */
-   ctx->Driver.BufferData(ctx, GL_ARRAY_BUFFER_ARB,
+   nouveau_bo_init_storage(ctx, NOUVEAU_BO_VRAM_OK,
 	 		  nvs->program_size * sizeof(uint32_t),
 			  (const GLvoid *)nvs->program,
 			  GL_DYNAMIC_DRAW_ARB,
 			  nvs->program_buffer);
 
-   offset = nouveau_bufferobj_gpu_ref(ctx, GL_READ_ONLY_ARB,
-	 			      nvs->program_buffer);
+   offset = nouveau_bo_gpu_ref(ctx, nvs->program_buffer);
 
    /* Not using state cache here, updated programs at the same address don't
     * seem to take effect unless the ACTIVE_PROGRAM method is called again.
-- 
2.30.2