From 24a4ea003f14a96ed07fdc4e92a67610655befad Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Tue, 19 Jan 2010 18:51:10 +0100
Subject: [PATCH] nv04-nv40: new 2D: add new Gallium-independent 2D engine

This patch add a brand new nv04-nv40 2D engine module.
It should correctly implement all operations involving swizzled, and 3D-swizzled surfaces.

This code is independent from the Gallium framework and can thus be reused in the DDX and classic Mesa drivers (it's only likely to be useful in the latter, though).

Currently, surface_copy and surface_fill are broken for 3D textures, for swizzled source textures and possibly for some misaligned cases

The code is based around the new nv04_region structure, which encapsulates the information from pipe_surface needed for the 2D engine and CPU copies.
The use of nv04_region makes the code independent of the Gallium framework and allows to transform the nv04_region without clobbering the nv04_region.
The existing M2MF, blitter, and SWIZZLED_SURFACE paths have been improved and a new CPU path has been added.
There is also support to tell the caller to use the 3D engine.

The main feature of the copy/fill setup algorithm is linearization/contiguous-linearization of swizzled surfaces.
The idea of linearization is that some swizzled surfaces are laid out like linear ones (1xN, 2xN, Nx1) and can thus be used as such (e.g. useful for copying single pixels).
Also, some rectangles (e.g. the whole surface) are contiguous in memory. If both the source and destination rectangles are swizzled but contiguous, then they can be regarded as both linear: this is the idea of "contiguous linearization".
This, for instance, allows to use the 2D engine to duplicate the content of a swizzled surface to another swizzled surface, by pretending they are actually linear.
After linearization, the result may not be 64-byte aligned. Another transformation is done to enlarge the linear surface so that it becomes 64-byte aligned.
This is also used to 64-byte align swizzled texture mipmaps.

The inner loop of the CPU path is as optimized as possible without using SSE/SSE2.
Future improvements could include SSE/SSE2 support, and possibly a faster coordinate swizzling algorithm (which is however not used in the inner loop).
It may be a good idea to autogenerate swizzling code at least for all possible POT 2D texture dimensions  (less than 256), maybe for all 3D ones too (less than 4096).
Also, it woud be a very good idea to make a copy with the GPU first if the source surface is in uncached memory.
---
 src/gallium/drivers/nvfx/Makefile        |    1 +
 src/gallium/drivers/nvfx/nv04_2d.c       | 1320 ++++++++++++++++++++++
 src/gallium/drivers/nvfx/nv04_2d.h       |   87 ++
 src/gallium/drivers/nvfx/nv04_2d_loops.h |   70 ++
 4 files changed, 1478 insertions(+)
 create mode 100644 src/gallium/drivers/nvfx/nv04_2d.c
 create mode 100644 src/gallium/drivers/nvfx/nv04_2d.h
 create mode 100644 src/gallium/drivers/nvfx/nv04_2d_loops.h

diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile
index c1d57ca3969..6536343e440 100644
--- a/src/gallium/drivers/nvfx/Makefile
+++ b/src/gallium/drivers/nvfx/Makefile
@@ -5,6 +5,7 @@ LIBNAME = nvfx
 
 C_SOURCES = \
 	nv04_surface_2d.c \
+	nv04_2d.c \
 	nvfx_buffer.c \
 	nvfx_context.c \
 	nvfx_clear.c \
diff --git a/src/gallium/drivers/nvfx/nv04_2d.c b/src/gallium/drivers/nvfx/nv04_2d.c
new file mode 100644
index 00000000000..b2f2ae79e12
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d.c
@@ -0,0 +1,1320 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Ben Skeggs
+ * Copyright 2009 Younes Manton
+ * Copyright 2010 Luca Barbieri
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <nouveau/nouveau_class.h>
+#include <nouveau/nouveau_device.h>
+#include <nouveau/nouveau_pushbuf.h>
+#include <nouveau/nouveau_channel.h>
+#include <nouveau/nouveau_bo.h>
+#include <nouveau/nouveau_notifier.h>
+#include <nouveau/nouveau_grobj.h>
+#include "nv04_2d.h"
+
+/* avoid depending on Mesa/Gallium */
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define likely(x) !!(x)
+#define unlikely(x) !!(x)
+#endif
+
+#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
+#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
+
+struct nv04_2d_context
+{
+	struct nouveau_notifier *ntfy;
+	struct nouveau_grobj *surf2d;
+	struct nouveau_grobj *swzsurf;
+	struct nouveau_grobj *m2mf;
+	struct nouveau_grobj *rect;
+	struct nouveau_grobj *sifm;
+	struct nouveau_grobj *blit;
+};
+
+static inline int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);
+}
+
+static inline int
+util_is_pot(unsigned x)
+{
+   return (x & (x - 1)) == 0;
+}
+
+/* Integer base-2 logarithm, rounded towards zero. */
+static inline unsigned log2i(unsigned i)
+{
+	unsigned r = 0;
+
+	if (i & 0xffff0000) {
+		i >>= 16;
+		r += 16;
+	}
+	if (i & 0x0000ff00) {
+		i >>= 8;
+		r += 8;
+	}
+	if (i & 0x000000f0) {
+		i >>= 4;
+		r += 4;
+	}
+	if (i & 0x0000000c) {
+		i >>= 2;
+		r += 2;
+	}
+	if (i & 0x00000002) {
+		r += 1;
+	}
+	return r;
+}
+
+//#define NV04_REGION_DEBUG
+
+// Yes, we really want to inline everything, since all the functions are used only once
+#if defined(__GNUC__) && defined(DEBUG)
+#define inline __attribute__((always_inline)) inline
+#endif
+
+static inline unsigned
+nv04_swizzle_bits_square(unsigned x, unsigned y)
+{
+	unsigned u = (x & 0x001) << 0 |
+		     (x & 0x002) << 1 |
+		     (x & 0x004) << 2 |
+		     (x & 0x008) << 3 |
+		     (x & 0x010) << 4 |
+		     (x & 0x020) << 5 |
+		     (x & 0x040) << 6 |
+		     (x & 0x080) << 7 |
+		     (x & 0x100) << 8 |
+		     (x & 0x200) << 9 |
+		     (x & 0x400) << 10 |
+		     (x & 0x800) << 11;
+
+	unsigned v = (y & 0x001) << 1 |
+		     (y & 0x002) << 2 |
+		     (y & 0x004) << 3 |
+		     (y & 0x008) << 4 |
+		     (y & 0x010) << 5 |
+		     (y & 0x020) << 6 |
+		     (y & 0x040) << 7 |
+		     (y & 0x080) << 8 |
+		     (y & 0x100) << 9 |
+		     (y & 0x200) << 10 |
+		     (y & 0x400) << 11 |
+		     (y & 0x800) << 12;
+	return v | u;
+}
+
+/* rectangular swizzled textures are linear concatenations of swizzled square tiles */
+static inline unsigned
+nv04_swizzle_bits_2d(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	if(h <= 1)
+		return x;
+	else
+	{
+		unsigned s = MIN2(w, h);
+		unsigned m = s - 1;
+		return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m);
+	}
+}
+
+// general 3D texture case
+static inline unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned z, unsigned w, unsigned h, unsigned d)
+{
+	if(d <= 1)
+		return nv04_swizzle_bits_2d(x, y, w, h);
+	else
+	{
+		// TODO: autogenerate code for all possible texture sizes (13 * 13 * 13 with dims <= 4096) and do a single indirect call
+		unsigned v = 0;
+		w >>= 1;
+		h >>= 1;
+		d >>= 1;
+		for(int i = 0;;)
+		{
+			int oldi = i;
+			if(likely(w))
+			{
+				v |= (x & 1) << i;
+				x >>= 1;
+				w >>= 1;
+				++i;
+			}
+
+			if(likely(h))
+			{
+				v |= (y & 1) << i;
+				y >>= 1;
+				h >>= 1;
+				++i;
+			}
+
+			if(likely(d))
+			{
+				v |= (z & 1) << i;
+				z >>= 1;
+				d >>= 1;
+				++i;
+			}
+
+			if(i == oldi)
+				break;
+		}
+		return v;
+	}
+}
+
+unsigned
+nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	if(rgn->pitch)
+		return rgn->pitch * rgn->y + (rgn->x << rgn->bpps);
+	else
+		return nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d) << rgn->bpps;
+}
+
+unsigned
+nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	if(rgn->pitch)
+		return rgn->pitch * (rgn->y + h - 1) + ((rgn->x + w) << rgn->bpps);
+	else
+		return (nv04_swizzle_bits(rgn->x + w - 1, rgn->y + h - 1, rgn->z, rgn->w, rgn->h, rgn->d) + 1) << rgn->bpps;
+}
+
+// *pitch = -1 -> use 3D swizzling for (x, y), *pitch = 0 -> use 2D swizzling, other *pitch -> use linear calculations
+// returns 2 if pixel order is 3D-swizzled and 1 if subrect is 2D-swizzled
+/* *pitch == -1 ret = 0 -> 3D swizzled subrect
+ * *pitch == 0 ret = 0 -> 2D swizzled subrect
+ * *pitch > 0 ret = 0 -> linear subrect
+ * *pitch > 0 ret = 1 -> linear subrect, but with swizzled 3D data inside
+ */
+
+static inline void
+nv04_region_print(struct nv04_region* rgn)
+{
+	fprintf(stderr, "<%i[%i]> ", rgn->bo->handle, rgn->offset);
+	if(rgn->pitch)
+		fprintf(stderr, "lin %i", rgn->pitch);
+	else
+		fprintf(stderr, "swz %ix%ix%i", rgn->w, rgn->h, rgn->d);
+	fprintf(stderr, " (%i, %i, %i)", rgn->x, rgn->y, rgn->z);
+}
+
+static inline void
+nv04_region_assert(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	unsigned end = rgn->offset + nv04_region_end(rgn, w, h);
+
+	assert(rgn->offset <= (int)rgn->bo->size);
+	assert(end <= rgn->bo->size);
+	if(!rgn->pitch) {
+		assert(util_is_pot(rgn->w));
+		assert(util_is_pot(rgn->h));
+	}
+}
+
+/* determine if region can be linearized or fake-linearized */
+static inline int
+nv04_region_is_contiguous(struct nv04_region* rgn, int w, int h)
+{
+	if(rgn->pitch)
+		return rgn->pitch == w << rgn->bpps;
+
+	// redundant, but this is the fast path for the common case
+	if(w == rgn->w && h == rgn->h && rgn->d <= 1)
+		return 1;
+
+	// must be POT
+	if((w & (w - 1)) || (h & (h - 1)))
+		return 0;
+
+	// must be aligned
+	if((rgn->x & (w - 1)) || (rgn->y & (h - 1)))
+		return 0;
+
+	if(rgn->d > 1)
+		return 0;
+
+	int surf_min = MIN2(rgn->w, rgn->h);
+	int rect_min = MIN2(w, h);
+
+	if((rect_min == surf_min) || (w == h) || (w == 2 * h))
+		return 1;
+
+	return 0;
+}
+
+// double the pitch until it is larger than the alignment, or the height becomes odd or 1
+static inline void
+nv04_region_contiguous_shape(struct nv04_region* rgn, int* w, int* h, int align)
+{
+	while(!(*h & 1) && (*w << rgn->bpps) < (1 << align))
+	{
+		*w <<= 1;
+		*h >>= 1;
+	}
+
+	while((*w << rgn->bpps) > 16384 && !(*w & 1))
+	{
+		*w >>= 1;
+		*h <<= 1;
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tCONTIGUOUS %ix%i\n", *w, *h);
+#endif
+}
+
+static inline void
+nv04_region_linearize_contiguous(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	int pos;
+	if(rgn->pitch)
+	{
+		rgn->offset += rgn->y * rgn->pitch + (rgn->x << rgn->bpps);
+		rgn->x = 0;
+		rgn->y = 0;
+	}
+	else
+	{
+		rgn->offset += (rgn->w * rgn->h * rgn->z) << rgn->bpps;
+		pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
+		rgn->x = pos & (w - 1);
+		rgn->y = pos / w;
+	}
+	rgn->pitch = w << rgn->bpps;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tLINEARIZE ");
+	nv04_region_print(rgn);
+	fprintf(stderr, "\n");
+#endif
+}
+
+	/* preserve the offset! */
+	/*
+	rgn->pitch = util_format_get_stride(rgn->format, w);
+	int pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
+	rgn->x = pos & (w - 1);
+	rgn->y = pos & ~(w - 1);
+	*/
+
+	/*
+	rgn->offset +=
+	rgn->pitch = util_format_get_stride(rgn->format, w);
+	rgn->x = 0;
+	rgn->y = 0;
+	*/
+
+/* This code will get used for, and always succeed on:
+ * - 4x2 1bpp swizzled texture mipmap levels
+ * - linear regions created by linearization
+ *
+ * This code will get used for, and MAY work for:
+ * - misaligned texture blanket
+ * - linear surfaces created without wide_pitch (in this case, it will only work if we are lucky)
+ *
+ * The general case requires splitting the region in 2.
+ */
+static inline int
+nv04_region_do_align_offset(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
+{
+	if(rgn->pitch > 0)
+	{
+		assert(!(rgn->offset & ((1 << rgn->bpps) - 1))); // fatal!
+		int delta = rgn->offset & ((1 << shift) - 1);
+
+		if(h <= 1)
+		{
+			rgn->x += delta >> rgn->bpps;
+			rgn->offset -= delta;
+			rgn->pitch = align((rgn->x + w) << rgn->bpps, 1 << shift);
+		}
+		else
+		{
+			int newxo = (rgn->x << rgn->bpps) + delta;
+			int dy = newxo / rgn->pitch;
+			newxo -= dy * rgn->pitch;
+			if((newxo + (w << rgn->bpps)) > rgn->pitch)
+			{
+				// TODO: split the region into two rectangles (!) if *really* necessary, unless the hardware actually supports "wrapping" rectangles
+				// this does not happen if the surface is pitch-aligned, which it should always be
+				assert(0);
+				return -1;
+			}
+			rgn->x = newxo >> rgn->bpps;
+			rgn->y += dy;
+		}
+	}
+	else
+	{
+		// we don't care about the alignment of 3D surfaces since the 2D engine can't use them
+		if(rgn->d < 0)
+			return -1;
+
+		int size;
+		int min = MIN2(rgn->w, rgn->h);
+		size = min * min << rgn->bpps;
+
+		// this is unfixable, and should not be happening
+		if(rgn->offset & (size - 1))
+			return -1;
+
+		int v = (rgn->offset & ((1 << shift) - 1)) / size;
+		rgn->offset -= v * size;
+
+		if(rgn->h == min)
+		{
+			unsigned w;
+			rgn->x += rgn->h * v;
+			w = rgn->w + rgn->h * v;
+
+			while(rgn->w < w)
+				rgn->w += rgn->w;
+		}
+		else
+		{
+			unsigned h;
+			rgn->y += rgn->w * v;
+			h = rgn->h + rgn->w * v;
+
+			while(rgn->h < h)
+				rgn->h += rgn->h;
+		}
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tALIGNED ");
+	nv04_region_print(rgn);
+	fprintf(stderr, "\n");
+#endif
+	return 0;
+}
+
+// both pitch and shift
+// will leave the region unchanged if it fails
+static inline int
+nv04_region_align(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
+{
+	if(rgn->pitch & ((1 << shift) - 1))
+	{
+		if(h == 1)
+			goto do_align; /* this will fix pitch too in this case */
+		else
+			return -1;
+	}
+
+	if(rgn->offset & ((1 << shift) - 1))
+	{
+		do_align:
+		if(nv04_region_do_align_offset(rgn, w, h, shift))
+			return -1;
+	}
+	return 0;
+}
+
+/* this contains 22 different copy loops after preprocessing. unfortunately, it's necessary */
+void
+nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h)
+{
+	if(dst->bo != src->bo)
+	{
+		nouveau_bo_map(dst->bo, NOUVEAU_BO_WR);
+		nouveau_bo_map(src->bo, NOUVEAU_BO_RD);
+	}
+	else
+		nouveau_bo_map(dst->bo, NOUVEAU_BO_WR | NOUVEAU_BO_RD);
+
+	uint8_t* mdst = dst->bo->map + dst->offset;
+	uint8_t* msrc = src->bo->map + src->offset;
+
+	int size = w << dst->bpps;
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_CPU [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+
+//	for(int i = 0; i < 16; ++i)
+//		fprintf(stderr, "%02x ", msrc[i]);
+//	fprintf(stderr, "\n");
+#endif
+
+	// TODO: support overlapping copies!
+	if(src->pitch && dst->pitch)
+	{
+		mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
+		msrc += src->y * src->pitch + (src->x << src->bpps);
+		if(dst->bo != src->bo)
+			goto simple;
+		else if(mdst < msrc)
+		{
+			if(mdst + size <= msrc)
+			{
+simple:
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memcpy(mdst, msrc, size);
+					msrc += src->pitch; mdst += dst->pitch;
+				}
+			}
+			else
+			{
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memmove(mdst, msrc, size);
+					msrc += src->pitch; mdst += dst->pitch;
+				}
+			}
+		}
+		else
+		{
+			/* copy backwards so we don't destroy data we have to read yet */
+			if(msrc + size <= mdst)
+			{
+				for(int iy = h - 1; iy >= 0; --iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memcpy(mdst, msrc, size);
+					msrc += src->pitch; mdst += dst->pitch;
+				}
+			}
+			else
+			{
+				for(int iy = h - 1; iy >= 0; --iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memmove(mdst, msrc, size);
+					msrc += src->pitch; mdst += dst->pitch;
+				}
+			}
+		}
+	}
+	else
+	{
+		int* dswx;
+		int* dswy;
+		int* sswx;
+		int* sswy;
+
+		if(!dst->pitch)
+		{
+			dswx = alloca(w * sizeof(int));
+			for(int ix = 0; ix < w; ++ix) // we are adding, so z cannot be contributed by both
+				dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, 0, dst->w, dst->h, dst->d);
+			dswy = alloca(h * sizeof(int));
+			for(int iy = 0; iy < h; ++iy)
+				dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
+		}
+
+		if(!src->pitch)
+		{
+			sswx = alloca(w * sizeof(int));
+			for(int ix = 0; ix < w; ++ix)
+				sswx[ix] = nv04_swizzle_bits(src->x + ix, 0, 0, src->w, src->h, src->d);
+			sswy = alloca(h * sizeof(int));
+			for(int iy = 0; iy < h; ++iy)
+				sswy[iy] = nv04_swizzle_bits(0, src->y + iy, src->z, src->w, src->h, src->d);
+		}
+
+		int dir = 1;
+		/* do backwards copies for overlapping swizzled surfaces */
+		if(dst->pitch == src->pitch && dst->offset == src->offset)
+		{
+			if(dst->y > src->y || (dst->y == src->y && dst->x > src->x))
+				dir = -1;
+		}
+
+#define SWIZZLED_COPY_LOOPS
+		if(dir == 1)
+		{
+			int dir = 1;
+#define LOOP_Y for(int iy = 0; iy < h; ++iy)
+#define LOOP_X for(int ix = 0; ix < w; ++ix)
+#include "nv04_2d_loops.h"
+#undef LOOP_X
+#undef LOOP_Y
+		}
+		else
+		{
+			int dir = -1;
+#define LOOP_Y for(int iy = h - 1; iy >= 0; --iy)
+#define LOOP_X for(int ix = w - 1; ix >= 0; --ix)
+#include "nv04_2d_loops.h"
+#undef LOOP_X
+#undef LOOP_Y
+		}
+#undef SWIZZLED_COPY_LOOP
+	}
+
+	if(src->bo != dst->bo)
+		nouveau_bo_unmap(src->bo);
+	nouveau_bo_unmap(dst->bo);
+}
+
+/* TODO: if the destination is swizzled, we are doing random writes, which causes write combining to fail
+ * the alternative is to read, modify and copy back, which may or may not be faster
+ * loading 3D textures is a common case that hits this and could probably benefit from the temporary
+ */
+void
+nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value)
+{
+	uint8_t* mdst = (nouveau_bo_map(dst->bo, NOUVEAU_BO_WR), dst->bo->map + dst->offset);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_FILL_CPU ");
+	nv04_region_print(dst);
+	fprintf(stderr, "\n");
+#endif
+
+	nv04_region_assert(dst, w, h);
+
+	if(dst->pitch)
+	{
+		unsigned size = w << dst->bpps;
+
+#define FILL(T) do { \
+			for(int iy = 0; iy < h; ++iy) \
+			{ \
+				assert((char*)((T*)mdst + w) <= (char*)dst->bo->map + dst->bo->size); \
+				for(int ix = 0; ix < w; ++ix) \
+					((T*)mdst)[ix] = (T)value; \
+				mdst += dst->pitch; \
+			} \
+		} while(0)
+
+		mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
+
+		if(dst->bpps == 0)
+		{
+ms:
+			assert(mdst + size * h <= (uint8_t*)dst->bo->map + dst->bo->size);
+			if(size == dst->pitch)
+				memset(mdst, (uint8_t)value, size * h);
+			else
+			{
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					memset(mdst, (uint8_t)value, size);
+					mdst += dst->pitch;
+				}
+			}
+		}
+		else if(dst->bpps == 1)
+		{
+			if(!((uint8_t)value ^ (uint8_t)(value >> 8)))
+				goto ms;
+
+			FILL(uint16_t);
+		}
+		else if(dst->bpps == 2)
+		{
+			if(value == (uint8_t)value * 0x1010101)
+				goto ms;
+			FILL(uint32_t);
+		}
+		else
+			assert(0);
+#undef FILL
+	}
+	else
+	{
+		int* dswx;
+		int* dswy;
+
+		dswx = alloca(w * sizeof(int));
+		for(int ix = 0; ix < w; ++ix)
+			dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, dst->z, dst->w, dst->h, dst->d);
+		dswy = alloca(h * sizeof(int));
+		for(int iy = 0; iy < h; ++iy)
+			dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
+
+#define FILL(T) do { \
+			T tvalue = (T)value; \
+			for(int iy = 0; iy < h; ++iy) \
+			{ \
+				T* pdst = (T*)mdst + dswy[iy]; \
+				for(int ix = 0; ix < w; ++ix) \
+				{ \
+					assert((uint8_t*)&pdst[dswx[ix] + 1] <= (uint8_t*)dst->bo->map + dst->bo->size); \
+					pdst[dswx[ix]] = tvalue; \
+				} \
+			} \
+		} while(0)
+
+		if(dst->bpps == 0)
+			FILL(uint8_t);
+		else if(dst->bpps == 1)
+			FILL(uint16_t);
+		else if(dst->bpps == 2)
+			FILL(uint32_t);
+		else
+			assert(0 && "unhandled bpp");
+#undef FILL
+	}
+
+	nouveau_bo_unmap(dst->bo);
+}
+
+static void
+nv04_region_copy_swizzle(struct nv04_2d_context *ctx,
+			  struct nv04_region* dst,
+			  struct nv04_region* src,
+			  int w, int h, int cs2d_format, int sifm_format)
+{
+	struct nouveau_channel *chan = ctx->swzsurf->channel;
+	struct nouveau_grobj *swzsurf = ctx->swzsurf;
+	struct nouveau_grobj *sifm = ctx->sifm;
+	/* Max width & height may not be the same on all HW, but must be POT */
+	unsigned max_shift = 10;
+	unsigned cw = 1 << max_shift;
+	unsigned ch = 1 << max_shift;
+	unsigned sx = dst->x >> max_shift;
+	unsigned sy = dst->y >> max_shift;
+	unsigned ex = (dst->x + w - 1) >> max_shift;
+	unsigned ey = (dst->y + h - 1) >> max_shift;
+	unsigned chunks = (ex - sx + 1) * (ey - sy + 1);
+	if(dst->w < cw)
+		cw = dst->w;
+	if(dst->h < ch)
+		ch = dst->h;
+	unsigned chunk_size = cw * ch << dst->bpps;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_SWIZZLE [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+
+	MARK_RING (chan, 8 + chunks * 17, 2 + chunks * 2);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, dst->bo,
+			NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
+	OUT_RING  (chan, cs2d_format |
+			 log2i(cw) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
+			 log2i(ch) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
+
+	BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, src->bo,
+			 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
+	OUT_RING  (chan, swzsurf->handle);
+
+	assert(!(dst->offset & 63));
+
+	for (int cy = sy; cy <= ey; ++cy) {
+	  int ry = MAX2(0, (int)(dst->y - ch * cy));
+	  int rh = MIN2((int)ch, (int)(dst->y - ch * cy + h)) - ry;
+	  for (int cx = sx; cx <= ex; ++cx) {
+	    int rx = MAX2(0, (int)(dst->x - cw * cx));
+	    int rw = MIN2((int)cw, (int)(dst->x - cw * cx + w)) - rx;
+
+	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
+
+	    unsigned dst_offset = dst->offset + (nv04_swizzle_bits_2d(cx * cw, cy * ch, dst->w, dst->h) << dst->bpps);
+	    assert(dst_offset <= dst->bo->size);
+	    assert(dst_offset + chunk_size <= dst->bo->size);
+	    OUT_RELOCl(chan, dst->bo, dst_offset,
+			    NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	    BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
+	    OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
+	    OUT_RING  (chan, sifm_format);
+	    OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
+	    OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | rw);
+	    OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | rw);
+	    OUT_RING  (chan, 1 << 20);
+	    OUT_RING  (chan, 1 << 20);
+
+	    BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | align(rw, 8));
+	    OUT_RING  (chan, src->pitch |
+			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
+			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
+	    unsigned src_offset = src->offset + (cy * ch + ry + src->y - dst->y) * src->pitch + ((cx * cw + rx + src->x - dst->x) << src->bpps);
+	    assert(src_offset <= src->bo->size);
+	    assert(src_offset + (src->pitch * (rh - 1)) + (rw << src->bpps) <= src->bo->size);
+	    OUT_RELOCl(chan, src->bo, src_offset,
+			     NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	    OUT_RING  (chan, 0);
+	  }
+	}
+}
+
+static inline void
+nv04_copy_m2mf_begin(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, struct nouveau_bo* srcbo, unsigned commands)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+	MARK_RING (chan, 3 + commands * 9, 2 + commands * 2);
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+	OUT_RELOCo(chan, srcbo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dstbo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+}
+
+static inline void
+nv04_copy_m2mf_body(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int* pdstoff, unsigned dstpitch, struct nouveau_bo* srcbo, int* psrcoff, unsigned srcpitch, unsigned size, unsigned lines)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\t\t\tCOPY_M2MF_BODY [%i, %i] <%i[%u]> lin %u <- <%i[%u]> lin %u\n", size, lines, dstbo->handle, *pdstoff, dstpitch, srcbo->handle, *psrcoff, srcpitch);
+#endif
+
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+	OUT_RELOCl(chan, srcbo, *psrcoff,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, dstbo, *pdstoff,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+	OUT_RING  (chan, srcpitch);
+	OUT_RING  (chan, dstpitch);
+	OUT_RING  (chan, size);
+	OUT_RING  (chan, lines);
+	OUT_RING  (chan, 0x0101);
+	OUT_RING  (chan, 0);
+
+	*psrcoff += srcpitch * lines;
+	*pdstoff += dstpitch * lines;
+}
+
+static void
+nv04_copy_m2mf(struct nv04_2d_context *ctx,
+		struct nouveau_bo* dstbo, int dstoff, unsigned dstpitch,
+		struct nouveau_bo* srcbo, int srcoff, unsigned srcpitch,
+		unsigned size, unsigned h)
+{
+	unsigned max_pitch = 32767;
+	unsigned max_lines = 2047;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\t\tCOPY_M2MF [%i, %i] <%i[%i]> lin %u <- <%i[%i]> lin %u\n", size, h, dstbo->handle, dstoff, dstpitch, srcbo->handle, srcoff, srcpitch);
+#endif
+
+	if(srcpitch <= max_pitch && dstpitch <= max_pitch)
+	{
+		unsigned full_pages = h / max_lines;
+		unsigned leftover_lines = h - full_pages * max_lines;
+
+		nv04_copy_m2mf_begin(ctx, dstbo, srcbo, full_pages + !!leftover_lines);
+
+		for(unsigned i = 0; i < full_pages; ++i)
+			nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, max_lines);
+
+		if(leftover_lines)
+			nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, leftover_lines);
+	}
+	else
+	{
+		unsigned lines = size / max_pitch;
+		unsigned leftover = size - lines * max_pitch;
+		unsigned full_pages = lines / max_lines;
+		unsigned leftover_lines = lines - full_pages * max_lines;
+		unsigned srcgap = srcpitch - size;
+		unsigned dstgap = dstpitch - size;
+
+		nv04_copy_m2mf_begin(ctx, dstbo, srcbo, h * (full_pages + !!leftover_lines + !!leftover));
+
+		for(unsigned i = 0; i < h; ++i)
+		{
+			for(unsigned j = 0; j < full_pages; ++j)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, max_lines);
+
+			if(leftover_lines)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, leftover_lines);
+
+			if(leftover)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, leftover, srcbo, &srcoff, leftover, leftover, 1);
+
+			srcoff += srcgap;
+			dstoff += dstgap;
+		}
+	}
+}
+
+void
+nv04_memcpy(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int dstoff, struct nouveau_bo* srcbo, int srcoff, unsigned size)
+{
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tMEMCPY [%i] <%i[%i]> <- <%i[%i]>\n", size, dstbo->handle, dstoff, srcbo->handle, srcoff);
+#endif
+
+	nv04_copy_m2mf(ctx, dstbo, dstoff, size, srcbo, srcoff, size, size, 1);
+}
+
+static void
+nv04_region_copy_m2mf(struct nv04_2d_context *ctx, struct nv04_region *dst, struct nv04_region *src, int w, int h)
+{
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_M2MF [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+	assert(src->pitch);
+	assert(dst->pitch);
+
+	nv04_copy_m2mf(ctx,
+			dst->bo, dst->offset + dst->y * dst->pitch + (dst->x << dst->bpps), dst->pitch,
+			src->bo, src->offset + src->y * src->pitch + (src->x << src->bpps), src->pitch,
+			w << src->bpps, h);
+}
+
+static inline void
+nv04_region_copy_blit(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src, int w, int h, int format)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *blit = ctx->blit;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_BLIT [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	assert(!(src->pitch & 63) && src->pitch);
+	assert(!(dst->pitch & 63) && dst->pitch);
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+
+	MARK_RING (chan, 12, 4);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, src->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, format);
+	OUT_RING  (chan, (dst->pitch << 16) | src->pitch);
+	OUT_RELOCl(chan, src->bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, blit, 0x0300, 3);
+	OUT_RING  (chan, (src->y << 16) | src->x);
+	OUT_RING  (chan, (dst->y << 16) | dst->x);
+	OUT_RING  (chan, ( h << 16) |  w);
+}
+
+/* THEOREM: a non-linearizable swizzled destination is always 64 byte aligned, except for 4x2 mipmap levels of swizzled 1bpp surfaces
+ * HYPOTESIS:
+ * 1. The first mipmap level is 64-byte-aligned
+ * PROOF:
+ * 1. Thus, all mipmaps level with a parent which is 64-byte or more in size are.
+ * 2. At 1bpp, the smallest levels with a <= 32-byte parent are either Nx1 or 1xN or size <=8, thus 4x2, 2x2 or 2x4
+ * 3. Nx1, 1xN, 2x4, 2x2 have all subrects linearizable. 4x2 does not.
+ * 4. At 2/4bpp or more, the smallest levels with a 32-byte parent are 1xN, Nx1 or 2x2
+ *
+ * However, nv04_region_align handles that.
+ */
+
+// 0 -> done, 1 -> do with 3D engine or CPU, -1 -> do with CPU
+// dst and src may be modified, and the possibly modified version should be passed to nv04_region_cpu if necessary
+int
+nv04_region_copy_2d(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src,
+		int w, int h, int cs2d_format, int sifm_format, int dst_to_gpu, int src_on_gpu)
+{
+	assert(src->bpps == dst->bpps);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "RGN_COPY%s [%i, %i: %i] ", (cs2d_format >= 0) ? "_2D" : "_NO2D", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		int gpu = i ? src_on_gpu : dst_to_gpu;
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, " %s", gpu ? "gpu" : "cpu");
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	// if they are contiguous and either both swizzled or both linear, reshape
+	if(!dst->pitch == !src->pitch
+		&& nv04_region_is_contiguous(dst, w, h)
+		&& nv04_region_is_contiguous(src, w, h))
+	{
+		nv04_region_contiguous_shape(dst, &w, &h, 6);
+		nv04_region_linearize_contiguous(dst, w, h);
+		nv04_region_linearize_contiguous(src, w, h);
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tOPT ");
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	/* if the destination is not for GPU _and_ source is on CPU, use CPU */
+	/* if the destination is not for GPU _or_ source is on CPU, use CPU only if we think it's faster than the GPU */
+	/* TODO: benchmark to find out in which cases exactly we should prefer the CPU */
+	 if((!dst_to_gpu && !src_on_gpu)
+		|| (!dst->pitch && dst->d > 1)
+		/* 3D swizzled destination are unwritable by the GPU, and 2D swizzled ones are readable only by the 3D engine */
+	 )
+		 return -1;
+	/* there is no known way to read 2D/3D-swizzled surfaces with the 2D engine
+	 * ask the caller to use the 3D engine
+	 * If a format cannot be sampled from the 3D engine there is no point in making it swizzled, so we must not do so
+	 */
+	 else if(!src->pitch)
+	 {
+#ifdef NV04_REGION_DEBUG
+		fprintf(stderr, "\tCOPY_ENG3D\n");
+#endif
+		 return 1;
+	 }
+	/* Setup transfer to swizzle the texture to vram if needed */
+	else
+	{
+		if (!dst->pitch)
+		{
+			if(cs2d_format < 0 || sifm_format < 0 || !dst_to_gpu)
+			{
+#ifdef NV04_REGION_DEBUG
+				fprintf(stderr, "\tCOPY_ENG3D\n");
+#endif
+				return 1;
+			}
+			else
+			{
+				assert(!nv04_region_align(dst, w, h, 6));
+
+				nv04_region_copy_swizzle(ctx, dst, src, w, h, cs2d_format, sifm_format);
+				return 0;
+			}
+		}
+		else
+		{
+			/* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback
+			 * to NV_MEMORY_TO_MEMORY_FORMAT in this case.
+			 * TODO: is this also true for the source? possibly not
+			 */
+
+			if ((cs2d_format < 0)
+				|| !dst_to_gpu
+				|| nv04_region_align(src, w, h, 6)
+				|| nv04_region_align(dst, w, h, 6)
+				)
+				nv04_region_copy_m2mf(ctx, dst, src, w, h);
+			else
+				nv04_region_copy_blit(ctx, dst, src, w, h, cs2d_format);
+
+			return 0;
+		}
+	}
+}
+
+static inline void
+nv04_region_fill_gdirect(struct nv04_2d_context *ctx, struct nv04_region* dst, int w, int h, unsigned value)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *rect = ctx->rect;
+	int cs2d_format, gdirect_format;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tFILL_GDIRECT\n");
+#endif
+
+	assert(!(dst->pitch & 63) && dst->pitch);
+	nv04_region_assert(dst, w, h);
+
+	if(dst->bpps == 0)
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+	}
+	else if(dst->bpps == 1)
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y16;
+	}
+	else if(dst->bpps == 2)
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+	}
+	else
+		assert(0);
+
+	MARK_RING (chan, 15, 4);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, cs2d_format);
+	OUT_RING  (chan, (dst->pitch << 16) | dst->pitch);
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
+	OUT_RING  (chan, gdirect_format);
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
+	OUT_RING  (chan, value);
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
+	OUT_RING  (chan, (dst->x << 16) | dst->y);
+	OUT_RING  (chan, ( w << 16) |  h);
+}
+
+int
+nv04_region_fill_2d(struct nv04_2d_context *ctx, struct nv04_region *dst,
+		  int w, int h, unsigned value)
+{
+	if(!w || !h)
+		return 0;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "FILL [%i, %i: %i] ", w, h, dst->bpps);
+	nv04_region_print(dst);
+	fprintf(stderr, " <- 0x%x\n", value);
+#endif
+
+	if(nv04_region_is_contiguous(dst, w, h))
+	{
+		nv04_region_contiguous_shape(dst, &w, &h, 6);
+		nv04_region_linearize_contiguous(dst, w, h);
+	}
+
+	// TODO: maybe do intermediate copies for some cases instead of using the 3D engine/CPU
+	/* GdiRect doesn't work together with swzsurf, so the 3D engine, or an intermediate copy, is the only option here */
+	if(!dst->pitch)
+	{
+#ifdef NV04_REGION_DEBUG
+		fprintf(stderr, "\tFILL_ENG3D\n");
+#endif
+		return 1;
+	}
+	else if(!nv04_region_align(dst, w, h, 6))
+	{
+		nv04_region_fill_gdirect(ctx, dst, w, h, value);
+		return 0;
+	}
+	else
+		return -1;
+}
+
+
+void
+nv04_2d_context_takedown(struct nv04_2d_context *ctx)
+{
+	nouveau_notifier_free(&ctx->ntfy);
+	nouveau_grobj_free(&ctx->m2mf);
+	nouveau_grobj_free(&ctx->surf2d);
+	nouveau_grobj_free(&ctx->swzsurf);
+	nouveau_grobj_free(&ctx->rect);
+	nouveau_grobj_free(&ctx->blit);
+	nouveau_grobj_free(&ctx->sifm);
+
+	free(ctx);
+}
+
+struct nv04_2d_context *
+nv04_2d_context_init(struct nouveau_channel* chan)
+{
+	struct nv04_2d_context *ctx = calloc(1, sizeof(struct nv04_2d_context));
+	unsigned handle = 0x88000000, class;
+	int ret;
+
+	if (!ctx)
+		return NULL;
+
+	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+
+	if (chan->device->chipset < 0x10)
+		class = NV04_CONTEXT_SURFACES_2D;
+	else
+		class = NV10_CONTEXT_SURFACES_2D;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->surf2d,
+			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+
+	if (chan->device->chipset < 0x10)
+		class = NV04_IMAGE_BLIT;
+	else
+		class = NV12_IMAGE_BLIT;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
+	OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
+
+	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
+				  &ctx->rect);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, ctx->rect,
+			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
+
+	switch (chan->device->chipset & 0xf0) {
+	case 0x00:
+	case 0x10:
+		class = NV04_SWIZZLED_SURFACE;
+		break;
+	case 0x20:
+		class = NV20_SWIZZLED_SURFACE;
+		break;
+	case 0x30:
+		class = NV30_SWIZZLED_SURFACE;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SWIZZLED_SURFACE;
+		break;
+	default:
+		/* Famous last words: this really can't happen.. */
+		assert(0);
+		break;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	/* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
+	if(ctx->swzsurf->bound == NOUVEAU_GROBJ_UNBOUND)
+		nouveau_grobj_autobind(ctx->swzsurf);
+
+	switch (chan->device->chipset & 0xf0) {
+	case 0x10:
+	case 0x20:
+		class = NV10_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x30:
+		class = NV30_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	default:
+		class = NV04_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	/* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
+	if(ctx->sifm->bound == NOUVEAU_GROBJ_UNBOUND)
+		nouveau_grobj_autobind(ctx->sifm);
+
+	return ctx;
+}
diff --git a/src/gallium/drivers/nvfx/nv04_2d.h b/src/gallium/drivers/nvfx/nv04_2d.h
new file mode 100644
index 00000000000..e638b8c8740
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d.h
@@ -0,0 +1,87 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Ben Skeggs
+ * Copyright 2009 Younes Manton
+ * Copyright 2010 Luca Barbieri
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
+
+#ifndef __NV04_2D_H__
+#define __NV04_2D_H__
+
+struct nv04_2d_context;
+struct nouveau_channel;
+struct nouveau_bo;
+
+// NOTE: all functions taking this as a parameter will CLOBBER it (except for ->bo)
+struct nv04_region {
+	struct nouveau_bo* bo;
+	int offset;
+	unsigned pitch; // 0 -> swizzled
+	unsigned bpps; // bpp shift (0, 1, 2; 3, 4 for fp/compressed)
+	unsigned x, y, z;
+	unsigned w, h, d;
+};
+
+void
+nv04_memcpy(struct nv04_2d_context *ctx,
+		struct nouveau_bo* dstbo, int dstoff,
+		struct nouveau_bo* srcbo, int srcoff,
+		unsigned size);
+
+unsigned
+nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h);
+
+unsigned
+nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h);
+
+void
+nv04_2d_context_takedown(struct nv04_2d_context *pctx);
+
+struct nv04_2d_context *
+nv04_2d_context_init(struct nouveau_channel* chan);
+
+void
+nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h);
+
+void
+nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value);
+
+int
+nv04_region_copy_2d(struct nv04_2d_context *ctx,
+		struct nv04_region* dst, struct nv04_region* src,
+		int w, int h,
+		int cs2d_format, int sifm_format,
+		int dst_to_gpu, int src_on_gpu);
+
+int
+nv04_region_fill_2d(struct nv04_2d_context *ctx,
+		struct nv04_region *dst,
+                int w, int h,
+                unsigned value);
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv04_2d_loops.h b/src/gallium/drivers/nvfx/nv04_2d_loops.h
new file mode 100644
index 00000000000..3a6787c0717
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d_loops.h
@@ -0,0 +1,70 @@
+#ifndef T
+{
+	if(dst->bpps == 0)
+#define T uint8_t
+#include "nv04_2d_loops.h"
+#undef T
+	else if(dst->bpps == 1)
+#define T uint16_t
+#include "nv04_2d_loops.h"
+#undef T
+	else if(dst->bpps == 2)
+#define T uint32_t
+#include "nv04_2d_loops.h"
+#undef T
+	else
+		assert(0);
+}
+#else
+#ifdef SWIZZLED_COPY_LOOPS
+{
+	if(!dst->pitch)
+	{
+		if(!src->pitch)
+		{
+			LOOP_Y
+			{
+				T* pdst = (T*)mdst + dswy[iy];
+				T* psrc = (T*)msrc + sswy[iy];
+				LOOP_X
+				{
+					assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size));
+					assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size));
+					pdst[dswx[ix]] = psrc[sswx[ix]];
+				}
+			}
+		}
+		else
+		{
+			T* psrc = (T*)(msrc + ((dir > 0) ? src->y : (src->y + h - 1)) * src->pitch) + src->x;
+			LOOP_Y
+			{
+				T* pdst = (T*)mdst + dswy[iy];
+				LOOP_X
+				{
+					assert((char*)&psrc[ix + 1] <= ((char*)src->bo->map + src->bo->size));
+					assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size));
+					pdst[dswx[ix]] = psrc[ix];
+				}
+				psrc = (T*)((char*)psrc + dir * src->pitch);
+			}
+		}
+	}
+	else
+	{
+		T* pdst = (T*)(mdst + ((dir > 0) ? dst->y : (dst->y + h - 1)) * dst->pitch) + dst->x;
+		LOOP_Y
+		{
+			T* psrc = (T*)msrc + sswy[iy];
+			LOOP_X
+			{
+				assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size));
+				assert((char*)&pdst[ix + 1] <= ((char*)dst->bo->map + dst->bo->size));
+				pdst[ix] = psrc[sswx[ix]];
+			}
+			pdst = (T*)((char*)pdst + dir * dst->pitch);
+		}
+	}
+}
+#endif
+#endif
-- 
2.30.2