From 5ba92a5b0543b4ff2c7db6101029ba36cb9843fa Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 29 Jan 2009 16:47:37 +1000
Subject: [PATCH] radeon/r200/r300: bring back old style DMA buffer on top of
 BOs.

This gets back a lot of the lost speed in gears, on r500 at least.

I also fixed the legacy bufmgr to cope when the DMA space fills up.
---
 src/mesa/drivers/dri/r200/Makefile            |   1 -
 src/mesa/drivers/dri/r200/r200_context.c      |   7 +-
 src/mesa/drivers/dri/r200/r200_context.h      |   4 -
 src/mesa/drivers/dri/r200/r200_maos_arrays.c  | 104 ----
 src/mesa/drivers/dri/r200/r200_texmem.c       | 531 ------------------
 src/mesa/drivers/dri/r300/r300_context.c      |   1 +
 src/mesa/drivers/dri/r300/r300_context.h      |   1 -
 src/mesa/drivers/dri/r300/r300_emit.c         |  52 +-
 src/mesa/drivers/dri/r300/r300_ioctl.c        |  10 +-
 src/mesa/drivers/dri/r300/r300_render.c       |   6 +-
 src/mesa/drivers/dri/r300/r300_swtcl.c        |  89 ++-
 src/mesa/drivers/dri/radeon/common_context.h  |  33 +-
 src/mesa/drivers/dri/radeon/common_misc.c     |  85 ++-
 src/mesa/drivers/dri/radeon/common_misc.h     |   4 +
 .../drivers/dri/radeon/radeon_bo_legacy.c     |  34 +-
 src/mesa/drivers/dri/radeon/radeon_context.c  |   6 +-
 src/mesa/drivers/dri/radeon/radeon_context.h  |   1 -
 src/mesa/drivers/dri/radeon/radeon_ioctl.c    |  14 +-
 src/mesa/drivers/dri/radeon/radeon_ioctl.h    |  16 +-
 19 files changed, 244 insertions(+), 755 deletions(-)
 delete mode 100644 src/mesa/drivers/dri/r200/r200_texmem.c

diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
index 5f1cfc889ee..d96999f7bbb 100644
--- a/src/mesa/drivers/dri/r200/Makefile
+++ b/src/mesa/drivers/dri/r200/Makefile
@@ -18,7 +18,6 @@ DRIVER_SOURCES = r200_context.c \
 		 r200_cmdbuf.c \
 		 r200_pixel.c \
 		 r200_tex.c \
-		 r200_texmem.c \
 		 r200_texstate.c \
 		 r200_tcl.c \
 		 r200_swtcl.c \
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 9f1edd39d3d..53c64eb5a38 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -365,9 +365,6 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    }
 
 
-   if (!rmesa->radeon.radeonScreen->kernel_mm)
-       rmesa->dma.buf0_address = rmesa->radeon.radeonScreen->buffers->list[0].address;
-
    (void) memset( rmesa->radeon.texture_heaps, 0, sizeof( rmesa->radeon.texture_heaps ) );
    make_empty_list( & rmesa->radeon.swapped );
 
@@ -587,8 +584,8 @@ void r200DestroyContext( __DRIcontextPrivate *driContextPriv )
       r200DestroySwtcl( rmesa->radeon.glCtx );
       r200ReleaseArrays( rmesa->radeon.glCtx, ~0 );
 
-      if (rmesa->dma.current.buf) {
-	//	 r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+      if (rmesa->dma.current) {
+	 radeonReleaseDmaRegion(rmesa);
 	 rcommonFlushCmdBuf( &rmesa->radeon, __FUNCTION__ );
       }
 
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index 62847be522e..80981135ac6 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -525,10 +525,6 @@ struct r200_state {
    GLuint envneeded;
 };
 
-#define GET_START(rvb) (rmesa->radeon.radeonScreen->gart_buffer_offset +		\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-
 #define R200_CMD_BUF_SZ  (16*1024) 
 
 #define R200_ELT_BUF_SZ  (16*1024) 
diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
index bcdc4336abd..278e00442c8 100644
--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
@@ -50,110 +50,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_maos.h"
 #include "r200_tcl.h"
 
-
-#if 0
-/* Usage:
- *   - from r200_tcl_render
- *   - call r200EmitArrays to ensure uptodate arrays in dma
- *   - emit primitives (new type?) which reference the data
- *       -- need to use elts for lineloop, quads, quadstrip/flat
- *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
- *
- */
-static void emit_ubyte_rgba3( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   r200_color_t *out = (r200_color_t *)(rvb->start + rvb->address);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p\n",
-	      __FUNCTION__, count, stride, (void *)out);
-
-   for (i = 0; i < count; i++) {
-      out->red   = *data;
-      out->green = *(data+1);
-      out->blue  = *(data+2);
-      out->alpha = 0xFF;
-      out++;
-      data += stride;
-   }
-}
-
-static void emit_ubyte_rgba4( GLcontext *ctx,
-			      struct radeon_dma_region *rvb,
-			      char *data,
-			      int stride,
-			      int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 4) {
-      for (i = 0; i < count; i++)
-	 ((int *)out)[i] = LE32_TO_CPU(((int *)data)[i]);
-   } else {
-      for (i = 0; i < count; i++) {
-	 *(int *)out++ = LE32_TO_CPU(*(int *)data);
-	 data += stride;
-      }
-   }
-}
-
-
-static void emit_ubyte_rgba( GLcontext *ctx,
-			     struct radeon_dma_region *rvb,
-			     char *data,
-			     int size,
-			     int stride,
-			     int count )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-     //       r200AllocDmaRegion( rmesa, rvb, 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
-   }
-   else {
-     //      r200AllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
-   }
-
-   /* Emit the data
-    */
-   switch (size) {
-   case 3:
-      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
-   }
-}
-#endif
-
-
 #if defined(USE_X86_ASM)
 #define COPY_DWORDS( dst, src, nr )					\
 do {									\
diff --git a/src/mesa/drivers/dri/r200/r200_texmem.c b/src/mesa/drivers/dri/r200/r200_texmem.c
deleted file mode 100644
index a50786e2e78..00000000000
--- a/src/mesa/drivers/dri/r200/r200_texmem.c
+++ /dev/null
@@ -1,531 +0,0 @@
-/**************************************************************************
-
-Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.  
-The Weather Channel, Inc. funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86
-license. This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation on the rights to use, copy, modify, merge, publish,
-distribute, sub license, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Kevin E. Martin <martin@valinux.com>
- *   Gareth Hughes <gareth@valinux.com>
- *
- */
- 
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/macros.h"
-#include "r200_context.h"
-#include "r200_ioctl.h"
-#include "r200_tex.h"
-#include "radeon_reg.h"
-
-#include <unistd.h>  /* for usleep() */
-
-#if 0
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void
-r200DestroyTexObj( r200ContextPtr rmesa, radeonTexObjPtr t )
-{
-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, 
-	       (void *)t, (void *)t->base.tObj );
-   }
-
-   if ( rmesa != NULL ) {
-      unsigned   i;
-
-
-      for ( i = 0 ; i < rmesa->radeon.glCtx->Const.MaxTextureUnits ; i++ ) {
-	 if ( t == rmesa->state.texture.unit[i].texobj ) {
-	    rmesa->state.texture.unit[i].texobj = NULL;
-	    rmesa->hw.tex[i].dirty = GL_FALSE;
-	    rmesa->hw.cube[i].dirty = GL_FALSE;
-	 }
-      }
-   }
-}
-
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-
-static void r200UploadGARTClientSubImage( r200ContextPtr rmesa,
-					  radeonTexObjPtr t, 
-					  struct gl_texture_image *texImage,
-					  GLint hwlevel,
-					  GLint x, GLint y, 
-					  GLint width, GLint height )
-{
-   const struct gl_texture_format *texFormat = texImage->TexFormat;
-   GLuint srcPitch, dstPitch;
-   int blit_format;
-   int srcOffset;
-
-   /*
-    * XXX it appears that we always upload the full image, not a subimage.
-    * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
-    * changed, the src pitch will have to change.
-    */
-   switch ( texFormat->TexelBytes ) {
-   case 1:
-      blit_format = R200_CP_COLOR_FORMAT_CI8;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   case 2:
-      blit_format = R200_CP_COLOR_FORMAT_RGB565;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   case 4:
-      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   default:
-      return;
-   }
-
-   t->image[0][hwlevel].data = texImage->Data;
-   srcOffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
-
-   assert( srcOffset != ~0 );
-
-   /* Don't currently need to cope with small pitches?
-    */
-   width = texImage->Width;
-   height = texImage->Height;
-
-   r200EmitWait( rmesa, RADEON_WAIT_3D );
-
-   r200EmitBlit( rmesa, blit_format, 
-		 srcPitch,  
-		 srcOffset,   
-		 dstPitch,
-		 t->bufAddr,
-		 x, 
-		 y, 
-		 t->image[0][hwlevel].x + x,
-		 t->image[0][hwlevel].y + y, 
-		 width,
-		 height );
-
-   r200EmitWait( rmesa, RADEON_WAIT_2D );
-}
-
-static void r200UploadRectSubImage( r200ContextPtr rmesa,
-				    radeonTexObjPtr t, 
-				    struct gl_texture_image *texImage,
-				    GLint x, GLint y, 
-				    GLint width, GLint height )
-{
-   const struct gl_texture_format *texFormat = texImage->TexFormat;
-   int blit_format, dstPitch, done;
-
-   switch ( texFormat->TexelBytes ) {
-   case 1:
-      blit_format = R200_CP_COLOR_FORMAT_CI8;
-      break;
-   case 2:
-      blit_format = R200_CP_COLOR_FORMAT_RGB565;
-      break;
-   case 4:
-      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
-      break;
-   default:
-      return;
-   }
-
-   t->image[0][0].data = texImage->Data;
-
-   /* Currently don't need to cope with small pitches.
-    */
-   width = texImage->Width;
-   height = texImage->Height;
-   dstPitch = t->pp_txpitch + 32;
-
-   if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
-      /* In this case, could also use GART texturing.  This is
-       * currently disabled, but has been tested & works.
-       */
-      if ( !t->image_override )
-         t->pp_txoffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
-      t->pp_txpitch = texImage->RowStride * texFormat->TexelBytes - 32;
-
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, 
-		 "Using GART texturing for rectangular client texture\n");
-
-      /* Release FB memory allocated for this image:
-       */
-      /* FIXME This may not be correct as driSwapOutTextureObject sets
-       * FIXME dirty_images.  It may be fine, though.
-       */
-      if ( t->base.memBlock ) {
-	 driSwapOutTextureObject( (driTextureObject *) t );
-      }
-   }
-   else if (texImage->IsClientData) {
-      /* Data already in GART memory, with usable pitch.
-       */
-      GLuint srcPitch;
-      srcPitch = texImage->RowStride * texFormat->TexelBytes;
-      r200EmitBlit( rmesa, 
-		    blit_format, 
-		    srcPitch,
-		    r200GartOffsetFromVirtual( rmesa, texImage->Data ),   
-		    dstPitch, t->bufAddr,
-		    0, 0, 
-		    0, 0, 
-		    width, height );
-   }
-   else {
-      /* Data not in GART memory, or bad pitch.
-       */
-      for (done = 0; done < height ; ) {
-	 struct radeon_dma_region region;
-	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
-	 int src_pitch;
-	 char *tex;
-
-         src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-	 tex = (char *)texImage->Data + done * src_pitch;
-
-	 memset(&region, 0, sizeof(region));
-	 //	 r200AllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
-
-	 /* Copy texdata to dma:
-	  */
-	 if (0)
-	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
-		    __FUNCTION__, src_pitch, dstPitch);
-
-	 if (src_pitch == dstPitch) {
-	    memcpy( region.address + region.start, tex, lines * src_pitch );
-	 } 
-	 else {
-	    char *buf = region.address + region.start;
-	    int i;
-	    for (i = 0 ; i < lines ; i++) {
-	       memcpy( buf, tex, src_pitch );
-	       buf += dstPitch;
-	       tex += src_pitch;
-	    }
-	 }
-
-	 r200EmitWait( rmesa, RADEON_WAIT_3D );
-
-	 /* Blit to framebuffer
-	  */
-	 r200EmitBlit( rmesa,
-		       blit_format,
-		       dstPitch, GET_START( &region ),
-		       dstPitch | (t->tile_bits >> 16),
-		       t->bufAddr,
-		       0, 0,
-		       0, done,
-		       width, lines );
-	 
-	 r200EmitWait( rmesa, RADEON_WAIT_2D );
-
-	 //	 r200ReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
-	 done += lines;
-      }
-   }
-}
-
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void uploadSubImage( r200ContextPtr rmesa, radeonTexObjPtr t, 
-			    GLint hwlevel,
-			    GLint x, GLint y, GLint width, GLint height,
-			    GLuint face )
-{
-   struct gl_texture_image *texImage = NULL;
-   GLuint offset;
-   GLint imageWidth, imageHeight;
-   GLint ret;
-   drm_radeon_texture_t tex;
-   drm_radeon_tex_image_t tmp;
-   const int level = hwlevel + t->base.firstLevel;
-
-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
-	       __FUNCTION__, (void *)t, (void *)t->base.tObj,
-	       level, width, height, face );
-   }
-
-   ASSERT(face < 6);
-
-   /* Ensure we have a valid texture to upload */
-   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
-      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-      return;
-   }
-
-   texImage = t->base.tObj->Image[face][level];
-
-   if ( !texImage ) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
-      return;
-   }
-   if ( !texImage->Data ) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
-      return;
-   }
-
-
-   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-      assert(level == 0);
-      assert(hwlevel == 0);
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
-      r200UploadRectSubImage( rmesa, t, texImage, x, y, width, height );
-      return;
-   }
-   else if (texImage->IsClientData) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is in GART client storage\n",
-		  __FUNCTION__);
-      r200UploadGARTClientSubImage( rmesa, t, texImage, hwlevel,
-				   x, y, width, height );
-      return;
-   }
-   else if ( R200_DEBUG & DEBUG_TEXTURE )
-      fprintf( stderr, "%s: image data is in normal memory\n",
-	       __FUNCTION__);
-      
-
-   imageWidth = texImage->Width;
-   imageHeight = texImage->Height;
-
-   offset = t->bufAddr + t->base.totalSize / 6 * face;
-
-   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      GLint imageX = 0;
-      GLint imageY = 0;
-      GLint blitX = t->image[face][hwlevel].x;
-      GLint blitY = t->image[face][hwlevel].y;
-      GLint blitWidth = t->image[face][hwlevel].width;
-      GLint blitHeight = t->image[face][hwlevel].height;
-      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
-	       imageWidth, imageHeight, imageX, imageY );
-      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
-	       blitWidth, blitHeight, blitX, blitY );
-      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-	       (GLuint)offset, hwlevel, level );
-   }
-
-   t->image[face][hwlevel].data = texImage->Data;
-
-   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-    * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-    * We used to use 1, 2 and 4-byte texels and used to use the texture
-    * width to dictate the blit width - but that won't work for compressed
-    * textures. (Brian)
-    * NOTE: can't do that with texture tiling. (sroland)
-    */
-   tex.offset = offset;
-   tex.image = &tmp;
-   /* copy (x,y,width,height,data) */
-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(tmp) );
-   
-   if (texImage->TexFormat->TexelBytes) {
-      /* use multi-byte upload scheme */
-      tex.height = imageHeight;
-      tex.width = imageWidth;
-      tex.format = t->pp_txformat & R200_TXFORMAT_FORMAT_MASK;
-      if (tex.format == R200_TXFORMAT_ABGR8888) {
-	 /* drm will refuse abgr8888 textures. */
-	 tex.format = R200_TXFORMAT_ARGB8888;
-      }
-      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
-      tex.offset += tmp.x & ~1023;
-      tmp.x = tmp.x % 1024;
-      if (t->tile_bits & R200_TXO_MICRO_TILE) {
-	 /* need something like "tiled coordinates" ? */
-	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
-	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
-	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-      }
-      else {
-	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-      }
-      if ((t->tile_bits & R200_TXO_MACRO_TILE) &&
-	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256) &&
-	 ((!(t->tile_bits & R200_TXO_MICRO_TILE) && (texImage->Height >= 8)) ||
-	    (texImage->Height >= 16))) {
-	 /* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
-	    OR if height is smaller than 8 automatically, but if micro tiling is active
-	    the limit is height 16 instead ? */
-	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-      }
-   }
-   else {
-      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
-         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
-         so the kernel module reads the right amount of data. */
-      tex.format = R200_TXFORMAT_I8; /* any 1-byte texel format */
-      tex.pitch = (BLIT_WIDTH_BYTES / 64);
-      tex.height = (imageHeight + 3) / 4;
-      tex.width = (imageWidth + 3) / 4;
-      switch (t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) {
-      case R200_TXFORMAT_DXT1:
-           tex.width *= 8;
-           break;
-      case R200_TXFORMAT_DXT23:
-      case R200_TXFORMAT_DXT45:
-           tex.width *= 16;
-           break;
-      default:
-          fprintf(stderr, "unknown compressed tex format in uploadSubImage\n");
-      }
-   }
-
-   LOCK_HARDWARE( &rmesa->radeon );
-   do {
-      ret = drmCommandWriteRead( rmesa->radeon.dri.fd, DRM_RADEON_TEXTURE,
-                                 &tex, sizeof(drm_radeon_texture_t) );
-      if (ret) {
-	 if (R200_DEBUG & DEBUG_IOCTL)
-	    fprintf(stderr, "DRM_RADEON_TEXTURE:  again!\n");
-	 usleep(1);
-      }
-   } while ( ret == -EAGAIN );
-
-   UNLOCK_HARDWARE( &rmesa->radeon );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
-      fprintf( stderr, "   offset=0x%08x\n",
-	       offset );
-      fprintf( stderr, "   image width=%d height=%d\n",
-	       imageWidth, imageHeight );
-      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
-	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
-	       t->image[face][hwlevel].data );
-      exit( 1 );
-   }
-}
-
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- * 
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int r200UploadTexImages( r200ContextPtr rmesa, radeonTexObjPtr t, GLuint face )
-{
-   const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-	       (void *)rmesa->radeon.glCtx, (void *)t->base.tObj, t->base.totalSize,
-	       t->base.firstLevel, t->base.lastLevel );
-   }
-
-   if ( !t || t->base.totalSize == 0 || t->image_override )
-      return 0;
-
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      r200Finish( rmesa->radeon.glCtx );
-   }
-
-   LOCK_HARDWARE( &rmesa->radeon );
-
-   if ( t->base.memBlock == NULL ) {
-      int heap;
-
-      heap = driAllocateTexture( rmesa->radeon.texture_heaps, rmesa->radeon.nr_heaps,
-				 (driTextureObject *) t );
-      if ( heap == -1 ) {
-	 UNLOCK_HARDWARE( &rmesa->radeon );
-	 return -1;
-      }
-
-      /* Set the base offset of the texture image */
-      t->bufAddr = rmesa->radeon.radeonScreen->texOffset[heap] 
-	   + t->base.memBlock->ofs;
-      t->pp_txoffset = t->bufAddr;
-       
-      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-	 /* hope it's safe to add that here... */
-	 t->pp_txoffset |= t->tile_bits;
-      }
-
-      /* Mark this texobj as dirty on all units:
-       */
-      t->dirty_state = R200_TEX_ALL;
-   }
-
-   /* Let the world know we've used this memory recently.
-    */
-   driUpdateTextureLRU( (driTextureObject *) t );
-   UNLOCK_HARDWARE( &rmesa->radeon );
-
-   /* Upload any images that are new */
-   if (t->base.dirty_images[face]) {
-      int i;
-      for ( i = 0 ; i < numLevels ; i++ ) {
-         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
-            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
-			    t->image[face][i].height, face );
-         }
-      }
-      t->base.dirty_images[face] = 0;
-   }
-
-
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      r200Finish( rmesa->radeon.glCtx );
-   }
-
-   return 0;
-}
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index eb8e481a18e..f57952d8780 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -259,6 +259,7 @@ static void r300_init_vtbl(radeonContextPtr radeon)
    radeon->vtbl.update_draw_buffer = r300UpdateDrawBuffer;
    radeon->vtbl.emit_cs_header = r300_vtbl_emit_cs_header;
    radeon->vtbl.emit_state = r300_vtbl_emit_state;
+   radeon->vtbl.flush_vertices = r300_vtbl_flush_vertices;
 }
 
 
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 8f67460e420..155529a8a67 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -699,7 +699,6 @@ struct r300_swtcl_info {
    GLuint specoffset;
 
    struct radeon_bo *bo;
-   void (*flush) (r300ContextPtr);
 };
 
 
diff --git a/src/mesa/drivers/dri/r300/r300_emit.c b/src/mesa/drivers/dri/r300/r300_emit.c
index e2d90a843ef..c47f19ea11e 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_emit.c
@@ -301,28 +301,28 @@ int r300EmitArrays(GLcontext * ctx)
 	}
 
 	/* Setup INPUT_ROUTE. */
-    if (rmesa->radeon.radeonScreen->kernel_mm) {
-      R300_STATECHANGE(rmesa, vir[0]);
-      rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
-      rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
-	rmesa->hw.vir[0].cmd[0] |=
-        (r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-                            vb->AttribPtr, inputs, tab, nr) & 0x3FFF) << 16;
-	R300_STATECHANGE(rmesa, vir[1]);
-	rmesa->hw.vir[1].cmd[0] |=
-	    (r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-	                        nr) & 0x3FFF) << 16;
-    } else {
-	R300_STATECHANGE(rmesa, vir[0]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
-	    r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-			       vb->AttribPtr, inputs, tab, nr);
-	R300_STATECHANGE(rmesa, vir[1]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
-	    r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-			       nr);
-    }
-
+	if (rmesa->radeon.radeonScreen->kernel_mm) {
+		R300_STATECHANGE(rmesa, vir[0]);
+		rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[0].cmd[0] |=
+			(r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
+					    vb->AttribPtr, inputs, tab, nr) & 0x3FFF) << 16;
+		R300_STATECHANGE(rmesa, vir[1]);
+		rmesa->hw.vir[1].cmd[0] |=
+			(r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
+					    nr) & 0x3FFF) << 16;
+	} else {
+		R300_STATECHANGE(rmesa, vir[0]);
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
+			r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
+					   vb->AttribPtr, inputs, tab, nr);
+		R300_STATECHANGE(rmesa, vir[1]);
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
+			r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
+					   nr);
+	}
+	
 	/* Setup INPUT_CNTL. */
 	R300_STATECHANGE(rmesa, vic);
 	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
@@ -337,6 +337,8 @@ int r300EmitArrays(GLcontext * ctx)
 
 	rmesa->state.aos_count = nr;
 
+	radeon_bo_unmap(rmesa->radeon.dma.current);
+
 	return R300_FALLBACK_NONE;
 }
 
@@ -347,13 +349,15 @@ void r300ReleaseArrays(GLcontext * ctx)
 
 	if (rmesa->state.elt_dma_bo) {
 		radeon_bo_unref(rmesa->state.elt_dma_bo);
-		rmesa->state.elt_dma_bo = 0;
+		rmesa->state.elt_dma_bo = NULL;
 	}
 	for (i = 0; i < rmesa->state.aos_count; i++) {
 		if (rmesa->state.aos[i].bo) {
-			rmesa->state.aos[i].bo = radeon_bo_unref(rmesa->state.aos[i].bo);
+			radeon_bo_unref(rmesa->state.aos[i].bo);
+			rmesa->state.aos[i].bo = NULL;
 		}
 	}
+	radeonReleaseDmaRegion(&rmesa->radeon);
 }
 
 void r300EmitCacheFlush(r300ContextPtr rmesa)
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
index d12fde175bc..b0a579bf840 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
@@ -393,7 +393,7 @@ static void r300EmitClearState(GLcontext * ctx)
 		R300_STATECHANGE(r300, fp);
 		R300_STATECHANGE(r300, r500fp);
 
-		BEGIN_BATCH(14);
+		BEGIN_BATCH(7);
 		OUT_BATCH_REGSEQ(R500_US_CONFIG, 2);
 		OUT_BATCH(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
 		OUT_BATCH(0x0);
@@ -619,10 +619,10 @@ void r300Flush(GLcontext * ctx)
 	if (RADEON_DEBUG & DEBUG_IOCTL)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
-    if (rmesa->swtcl.flush) {
-        rmesa->swtcl.flush(rmesa);
-    }
-
+	if (rmesa->radeon.dma.flush) {
+		rmesa->radeon.dma.flush(ctx);
+	}
+	
 	if (rmesa->radeon.cmdbuf.cs->cdw) {
 		rcommonFlushCmdBuf(&rmesa->radeon, __FUNCTION__);
 	}
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index ef3671eadbd..57249c46ef8 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -177,10 +177,8 @@ static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts)
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	void *out;
 
-	rmesa->state.elt_dma_bo = radeon_bo_open(rmesa->radeon.radeonScreen->bom,
-                                             0, n_elts * 4, 4,
-                                             RADEON_GEM_DOMAIN_GTT, 0);
-	rmesa->state.elt_dma_offset = 0;
+	radeonAllocDmaRegion(&rmesa->radeon, &rmesa->state.elt_dma_bo,
+			     &rmesa->state.elt_dma_offset, n_elts * 4, 4);
 	radeon_bo_map(rmesa->state.elt_dma_bo, 1);
 	out = rmesa->state.elt_dma_bo->ptr + rmesa->state.elt_dma_offset;
 	memcpy(out, elts, n_elts * 4);
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.c b/src/mesa/drivers/dri/r300/r300_swtcl.c
index eb86bd3bdd6..1ce51b21f3a 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.c
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.c
@@ -57,7 +57,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_ioctl.h"
 #include "r300_emit.h"
 
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  );
+static void flush_last_swtcl_prim( GLcontext *ctx);
 
 
 void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, struct radeon_bo *bo, GLuint offset);
@@ -241,26 +241,45 @@ static void r300SetVertexFormat( GLcontext *ctx )
 
 /* Flush vertices in the current dma region.
  */
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  )
+static void flush_last_swtcl_prim( GLcontext *ctx  )
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct radeon_dma *dma = &rmesa->radeon.dma;
+		
+
 	if (RADEON_DEBUG & DEBUG_IOCTL)
 		fprintf(stderr, "%s\n", __FUNCTION__);
-    rmesa->swtcl.flush = NULL;
-    radeon_bo_unmap(rmesa->swtcl.bo);
-    rcommonEnsureCmdBufSpace(rmesa,
-			     rmesa->hw.max_state_size + (12*sizeof(int)),
-			     __FUNCTION__);
-    r300EmitState(rmesa);
-    r300EmitVertexAOS(rmesa,
-                      rmesa->swtcl.vertex_size,
-                      rmesa->swtcl.bo,
-                      0);
-    r300EmitVbufPrim(rmesa,
-                     rmesa->swtcl.hw_primitive,
-                     rmesa->swtcl.numverts);
-    r300EmitCacheFlush(rmesa);
-    COMMIT_BATCH();
-    rmesa->swtcl.numverts = 0;
+	dma->flush = NULL;
+
+	if (dma->current) {
+	    GLuint current_offset = dma->current_used;
+
+	    assert (dma->current_used +
+		    rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+		    dma->current_vertexptr);
+
+	    radeon_bo_unmap(dma->current);
+	    if (dma->current_used != dma->current_vertexptr) {
+		    dma->current_used = dma->current_vertexptr;
+
+		    rcommonEnsureCmdBufSpace(rmesa,
+					     rmesa->hw.max_state_size + (12*sizeof(int)),
+					     __FUNCTION__);
+		    r300EmitState(rmesa);
+		    r300EmitVertexAOS(rmesa,
+				      rmesa->swtcl.vertex_size,
+				      dma->current,
+				      current_offset);
+
+		    r300EmitVbufPrim(rmesa,
+				     rmesa->swtcl.hw_primitive,
+				     rmesa->swtcl.numverts);
+		    r300EmitCacheFlush(rmesa);
+		    COMMIT_BATCH();
+	    }
+	    radeonReleaseDmaRegion(&rmesa->radeon);
+	    rmesa->swtcl.numverts = 0;
+	}
 }
 
 /* Alloc space in the current dma region.
@@ -269,15 +288,29 @@ static void *
 r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
 {
 	GLuint bytes = vsize * nverts;
+	struct radeon_dma *dma = &rmesa->radeon.dma;
+	void *head;
 
-	rmesa->swtcl.bo = radeon_bo_open(rmesa->radeon.radeonScreen->bom,
-					 0, bytes, 4, RADEON_GEM_DOMAIN_GTT, 0);
-	radeon_bo_map(rmesa->swtcl.bo, 1);
-	if (rmesa->swtcl.flush == NULL) {
-	  rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-	  rmesa->swtcl.flush = flush_last_swtcl_prim;
+	/* Start a new buffer if none is active or the vertices would not fit. */
+	if (!dma->current || dma->current_vertexptr + bytes > dma->current->size) {
+		radeonRefillCurrentDmaRegion( &rmesa->radeon, bytes);
 	}
-	return rmesa->swtcl.bo->ptr;
+
+	if (!dma->flush) { /* arm the flush callback for the vertices handed out below */
+		rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+		dma->flush = flush_last_swtcl_prim;
+	}
+
+	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
+	ASSERT( dma->flush == flush_last_swtcl_prim );
+	ASSERT( dma->current_used +
+		rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+		dma->current_vertexptr );
+	/* Byte offset into the mapping; cast so the arithmetic is done on char *. */
+	head = (char *)dma->current->ptr + dma->current_vertexptr;
+	dma->current_vertexptr += bytes; /* reserve the space just handed out */
+	rmesa->swtcl.numverts += nverts;
+	return head;
 }
 
 static GLuint reduced_prim[] = {
@@ -550,9 +583,9 @@ static void r300RenderStart(GLcontext *ctx)
 	r300UpdateShaderStates(rmesa);
 
 	r300EmitCacheFlush(rmesa);
-    if (rmesa->swtcl.flush != NULL) {
-        rmesa->swtcl.flush(rmesa);
-    }
+	if (rmesa->radeon.dma.flush != NULL) {
+		rmesa->radeon.dma.flush(ctx);
+	}
 }
 
 static void r300RenderFinish(GLcontext *ctx)
diff --git a/src/mesa/drivers/dri/radeon/common_context.h b/src/mesa/drivers/dri/radeon/common_context.h
index 09a53b00eae..471e7cdfb17 100644
--- a/src/mesa/drivers/dri/radeon/common_context.h
+++ b/src/mesa/drivers/dri/radeon/common_context.h
@@ -225,16 +225,26 @@ struct radeon_aos {
 };
 
 struct radeon_dma {
-   /* Active dma region.  Allocations for vertices and retained
-    * regions come from here.  Also used for emitting random vertices,
-    * these may be flushed by calling flush_current();
-    */
-   struct radeon_dma_region current;
-   
-   void (*flush)( GLcontext *ctx );
-
-   char *buf0_address;		/* start of buf[0], for index calcs */
-   GLuint nr_released_bufs;	/* flush after so many buffers released */
+        /* Active dma region.  Allocations for vertices and retained
+         * regions come from here.  Also used for emitting random vertices,
+         * these may be flushed by calling flush_current();
+         */
+        struct radeon_bo *current; /** Buffer that DMA memory is allocated from */
+        int current_used; /** Number of bytes allocated and forgotten about */
+        int current_vertexptr; /** End of active vertex region */
+
+        /**
+         * If current_vertexptr != current_used then flush must be non-NULL.
+         * flush must be called before non-active vertex allocations can be
+         * performed.
+         */
+        void (*flush) (GLcontext *);
+
+        /* Number of "in-flight" DMA buffers, i.e. the number of buffers
+         * for which a DISCARD command is currently queued in the command
+         * buffer.
+         */
+        GLuint nr_released_bufs;
 };
 
 struct radeon_ioctl {
@@ -266,6 +276,8 @@ static INLINE GLuint radeonPackColor(GLuint cpp,
 
 #define MAX_CMD_BUF_SZ (16*1024)
 
+#define MAX_DMA_BUF_SZ (64*1024)
+
 struct radeon_store {
 	GLuint statenr;
 	GLuint primnr;
@@ -354,6 +366,7 @@ struct radeon_context {
    int                   texture_depth;
    float                 initialMaxAnisotropy;
 
+  struct radeon_dma dma;
    /* Rasterization and vertex state:
     */
    GLuint TclFallback;
diff --git a/src/mesa/drivers/dri/radeon/common_misc.c b/src/mesa/drivers/dri/radeon/common_misc.c
index 99ca936dae3..3ed58815d33 100644
--- a/src/mesa/drivers/dri/radeon/common_misc.c
+++ b/src/mesa/drivers/dri/radeon/common_misc.c
@@ -1316,22 +1316,19 @@ void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
 	uint32_t *out;
 	uint32_t bo_size;
 
-	memset(aos, 0, sizeof(struct radeon_aos));
 	if (stride == 0) {
-		bo_size = size * 4;
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
 		count = 1;
 		aos->stride = 0;
 	} else {
-		bo_size = size * count * 4;
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
 		aos->stride = size;
 	}
-	aos->bo = radeon_bo_open(rmesa->radeonScreen->bom,
-				 0, bo_size, 32, RADEON_GEM_DOMAIN_GTT, 0);
-	aos->offset = 0;
+
 	aos->components = size;
 	aos->count = count;
 
-	radeon_bo_map(aos->bo, 1);
+//	radeon_bo_map(aos->bo, 1);
 	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
 	switch (size) {
 	case 1: radeonEmitVec4(out, data, stride, count); break;
@@ -1342,7 +1339,7 @@ void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
 		assert(0);
 		break;
 	}
-	radeon_bo_unmap(aos->bo);
+//	radeon_bo_unmap(aos->bo);
 }
 
 
@@ -2321,3 +2318,75 @@ void radeonSpanRenderFinish(GLcontext * ctx)
 		unmap_buffer(ctx->DrawBuffer->_StencilBuffer->Wrapped);
 }
 
+/* Replaces rmesa->dma.current with a newly opened, mapped GTT buffer of
+ * MAX2(size, MAX_DMA_BUF_SZ * 16) bytes, after running any pending flush. */
+void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size)
+{
+	size = MAX2(size, MAX_DMA_BUF_SZ * 16);
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	/* No unmap before flushing: r300's flush_last_swtcl_prim unmaps and
+	 * releases dma.current itself, and dma.current may be NULL here. */
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa->glCtx);
+
+	if (rmesa->dma.nr_released_bufs > 4) {
+		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
+		rmesa->dma.nr_released_bufs = 0;
+	}
+
+	if (rmesa->dma.current) {
+		/* Balances the radeon_bo_map() done when this buffer was opened. */
+		radeon_bo_unmap(rmesa->dma.current);
+		radeon_bo_unref(rmesa->dma.current);
+		rmesa->dma.current = NULL;
+	}
+
+	rmesa->dma.current = radeon_bo_open(rmesa->radeonScreen->bom,
+					    0, size, 4, RADEON_GEM_DOMAIN_GTT,
+					    0);
+	rmesa->dma.current_used = 0;
+	rmesa->dma.current_vertexptr = 0;
+	radeon_bo_map(rmesa->dma.current, 1);
+}
+
+/* Allocates `bytes` from rmesa->dma.current at `alignment` (must be a power
+ * of two for the mask arithmetic below), grabbing a new buffer and discarding
+ * the rest of current if it does not fit.  Returns a referenced bo and offset. */
+void radeonAllocDmaRegion(radeonContextPtr rmesa,
+			  struct radeon_bo **pbo, int *poffset,
+			  int bytes, int alignment)
+{
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa->glCtx);
+
+	assert(rmesa->dma.current_used == rmesa->dma.current_vertexptr);
+
+	alignment--; /* alignment - 1, used as a bit mask */
+	rmesa->dma.current_used = (rmesa->dma.current_used + alignment) & ~alignment;
+
+	if (!rmesa->dma.current || rmesa->dma.current_used + bytes > rmesa->dma.current->size)
+		radeonRefillCurrentDmaRegion(rmesa, (bytes + 15) & ~15);
+
+	*poffset = rmesa->dma.current_used;
+	*pbo = rmesa->dma.current;
+	radeon_bo_ref(*pbo); /* extra reference handed out through *pbo */
+
+	/* Always align to at least 16 bytes */
+	rmesa->dma.current_used = (rmesa->dma.current_used + bytes + 15) & ~15;
+	rmesa->dma.current_vertexptr = rmesa->dma.current_used;
+
+	assert(rmesa->dma.current_used <= rmesa->dma.current->size);
+}
+
+/* Drops the context's reference on dma.current and counts it as released. */
+void radeonReleaseDmaRegion(radeonContextPtr rmesa)
+{
+	rmesa->dma.nr_released_bufs++; /* radeonRefillCurrentDmaRegion flushes the cmdbuf once this exceeds 4 */
+	radeon_bo_unref(rmesa->dma.current);
+	rmesa->dma.current = NULL; /* the next allocation has to refill */
+}
diff --git a/src/mesa/drivers/dri/radeon/common_misc.h b/src/mesa/drivers/dri/radeon/common_misc.h
index aeff52a66e1..d17d1607db1 100644
--- a/src/mesa/drivers/dri/radeon/common_misc.h
+++ b/src/mesa/drivers/dri/radeon/common_misc.h
@@ -118,4 +118,8 @@ GLubyte *radeon_ptr16(const struct radeon_renderbuffer * rrb,
 		    GLint x, GLint y);
 GLubyte *radeon_ptr32(const struct radeon_renderbuffer * rrb,
 		    GLint x, GLint y);
+void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size);
+void radeonAllocDmaRegion(radeonContextPtr rmesa,
+			  struct radeon_bo **pbo, int *poffset,
+			  int bytes, int alignment);
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
index 353f00100a0..bd126c026cc 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
@@ -77,6 +77,7 @@ struct bo_manager_legacy {
     uint32_t                    fb_location;
     uint32_t                    texture_offset;
     unsigned                    dma_alloc_size;
+    uint32_t                    dma_buf_count;
     unsigned                    cpendings;
     driTextureObject            texture_swapped;
     driTexHeap                  *texture_heap;
@@ -221,7 +222,7 @@ static int legacy_wait_pending(struct radeon_bo *bo)
     return 0;
 }
 
-static void legacy_track_pending(struct bo_manager_legacy *boml)
+static void legacy_track_pending(struct bo_manager_legacy *boml, int debug)
 {
     struct bo_legacy *bo_legacy;
     struct bo_legacy *next;
@@ -229,6 +230,9 @@ static void legacy_track_pending(struct bo_manager_legacy *boml)
     legacy_get_current_age(boml);
     bo_legacy = boml->pending_bos.pnext;
     while (bo_legacy) {
+        if (debug)
+	  fprintf(stderr,"pending %p %d %d %d\n", bo_legacy, bo_legacy->base.size,
+		  boml->current_age, bo_legacy->pending);
         next = bo_legacy->pnext;
         if (legacy_is_pending(&(bo_legacy->base))) {
         }
@@ -236,6 +240,19 @@ static void legacy_track_pending(struct bo_manager_legacy *boml)
     } 
 }
 
+/* Calls legacy_wait_pending() on the first pending bo; returns -1 if the pending list is empty, else 0. */
+static int legacy_wait_any_pending(struct bo_manager_legacy *boml)
+{
+    struct bo_legacy *bo_legacy;
+
+    legacy_get_current_age(boml);
+    bo_legacy = boml->pending_bos.pnext;
+    if (!bo_legacy)
+      return -1;
+    legacy_wait_pending(&bo_legacy->base);
+    return 0;
+}
+
 static struct bo_legacy *bo_allocate(struct bo_manager_legacy *boml,
                                      uint32_t size,
                                      uint32_t alignment,
@@ -292,13 +309,13 @@ static int bo_dma_alloc(struct radeon_bo *bo)
     if (r) {
         /* ptr is set to NULL if dma allocation failed */
         bo_legacy->ptr = NULL;
-        exit(0);
         return r;
     }
     bo_legacy->ptr = boml->screen->gartTextures.map + base_offset;
     bo_legacy->offset = boml->screen->gart_texture_offset + base_offset;
     bo->size = size;
     boml->dma_alloc_size += size;
+    boml->dma_buf_count++;
     return 0;
 }
 
@@ -328,6 +345,7 @@ static int bo_dma_free(struct radeon_bo *bo)
         return r;
     }
     boml->dma_alloc_size -= bo_legacy->base.size;
+    boml->dma_buf_count--;
     return 0;
 }
 
@@ -388,15 +406,20 @@ static struct radeon_bo *bo_open(struct radeon_bo_manager *bom,
         return NULL;
     }
     if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) {
-        legacy_track_pending(boml);
+    retry:
+        legacy_track_pending(boml, 0);
         /* dma buffers */
+
         r = bo_dma_alloc(&(bo_legacy->base));
         if (r) {
-            fprintf(stderr, "Ran out of GART memory (for %d)!\n", size);
+	  if (legacy_wait_any_pending(boml) == -1) {
+	    fprintf(stderr, "Ran out of GART memory (for %d)!\n", size);
             fprintf(stderr, "Please consider adjusting GARTSize option.\n");
             bo_free(bo_legacy);
             exit(-1);
-            return NULL;
+	  }
+	  goto retry;
+	  return NULL;
         }
     } else {
         bo_legacy->ptr = malloc(bo_legacy->base.size);
@@ -460,7 +483,6 @@ static int bo_map(struct radeon_bo *bo, int write)
         volatile int *buf = (int*)boml->screen->driScreen->pFB;
         p = *buf;
     }
-
     return 0;
 }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index b87275c56b8..a6a3b1178c6 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -318,7 +318,7 @@ radeonCreateContext( const __GLcontextModes *glVisual,
 				       screen->sarea_priv_offset);
 
 
-   rmesa->dma.buf0_address = rmesa->radeon.radeonScreen->buffers->list[0].address;
+   //rmesa->dma.buf0_address = rmesa->radeon.radeonScreen->buffers->list[0].address;
 
    (void) memset( rmesa->radeon.texture_heaps, 0, sizeof( rmesa->radeon.texture_heaps ) );
    make_empty_list( & rmesa->radeon.swapped );
@@ -522,8 +522,8 @@ void radeonDestroyContext( __DRIcontextPrivate *driContextPriv )
 
       radeonDestroySwtcl( rmesa->radeon.glCtx );
       radeonReleaseArrays( rmesa->radeon.glCtx, ~0 );
-      if (rmesa->dma.current.buf) {
-	 radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+      if (rmesa->radeon.dma.current) {
+	 radeonReleaseDmaRegion( rmesa, &rmesa->radeon.dma.current, __FUNCTION__ );
 	 radeonFlushCmdBuf( rmesa, __FUNCTION__ );
       }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index dedc3626040..ba5c57f1210 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -425,7 +425,6 @@ struct r100_context {
 	/* Vertex buffers
 	 */
 	struct radeon_ioctl ioctl;
-	struct radeon_dma dma;
 	struct radeon_store store;
 	/* A full state emit as of the first state emit in the main store, in case
 	 * the context is lost.
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.c b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
index cd0f90d748b..5ab19b2a8c2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
@@ -117,12 +117,12 @@ static void radeonBackUpAndEmitLostStateLocked( r100ContextPtr rmesa )
 
    rmesa->radeon.lost_context = GL_FALSE;
 
-   nr_released_bufs = rmesa->dma.nr_released_bufs;
+   nr_released_bufs = rmesa->radeon.dma.nr_released_bufs;
    saved_store = rmesa->store;
-   rmesa->dma.nr_released_bufs = 0;
+   rmesa->radeon.dma.nr_released_bufs = 0;
    rmesa->store = rmesa->backup_store;
    radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
-   rmesa->dma.nr_released_bufs = nr_released_bufs;
+   rmesa->radeon.dma.nr_released_bufs = nr_released_bufs;
    rmesa->store = saved_store;
 }
 
@@ -308,8 +308,8 @@ void radeonFlushElts( GLcontext *ctx )
    if (RADEON_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-   assert( rmesa->dma.flush == radeonFlushElts );
-   rmesa->dma.flush = NULL;
+   assert( rmesa->radeon.dma.flush == radeonFlushElts );
+   rmesa->radeon.dma.flush = NULL;
 
    /* Cope with odd number of elts:
     */
@@ -381,9 +381,9 @@ GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
 	      __FUNCTION__,
 	      cmd[1].i, vertex_format, primitive);
 
-   assert(!rmesa->dma.flush);
+   assert(!rmesa->radeon.dma.flush);
    rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-   rmesa->dma.flush = radeonFlushElts;
+   rmesa->radeon.dma.flush = radeonFlushElts;
 
    rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.h b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
index d11feb58043..b4bc9b11441 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.h
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
@@ -76,16 +76,6 @@ extern void radeonEmitBlit( r100ContextPtr rmesa,
 extern void radeonEmitWait( r100ContextPtr rmesa, GLuint flags );
 
 extern void radeonFlushCmdBuf( r100ContextPtr rmesa, const char * );
-extern void radeonRefillCurrentDmaRegion( r100ContextPtr rmesa );
-
-extern void radeonAllocDmaRegion( r100ContextPtr rmesa,
-				  struct radeon_dma_region *region,
-				  int bytes, 
-				  int alignment );
-
-extern void radeonReleaseDmaRegion( r100ContextPtr rmesa,
-				    struct radeon_dma_region *region,
-				    const char *caller );
 
 extern void radeonFlush( GLcontext *ctx );
 extern void radeonFinish( GLcontext *ctx );
@@ -101,8 +91,8 @@ extern void radeonSetUpAtomList( r100ContextPtr rmesa );
  */
 #define RADEON_NEWPRIM( rmesa )			\
 do {						\
-   if ( rmesa->dma.flush )			\
-      rmesa->dma.flush( rmesa->radeon.glCtx );	\
+   if ( rmesa->radeon.dma.flush )			\
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
 } while (0)
 
 /* Can accomodate several state changes and primitive changes without
@@ -142,7 +132,7 @@ static INLINE int RADEON_DB_STATECHANGE(
  */
 #define RADEON_FIREVERTICES( rmesa )			\
 do {							\
-   if ( rmesa->store.cmd_used || rmesa->dma.flush ) {	\
+   if ( rmesa->store.cmd_used || rmesa->radeon.dma.flush ) {	\
       radeonFlush( rmesa->radeon.glCtx );			\
    }							\
 } while (0)
-- 
2.30.2