From 6f3cc6a5226fd4b5d44cca91e2f76216ecaff831 Mon Sep 17 00:00:00 2001
From: Eric Anholt <anholt@FreeBSD.org>
Date: Tue, 17 Aug 2004 01:41:29 +0000
Subject: [PATCH] Close some races with locking on R100 and R200 which could
 manifest as rendering errors on r100 and rendering errors and hangs on r200
 (same for R100 without OLD_PACKETS).

If a command buffer filled after some state (EmitState or a VBPNTR write) was
emitted, the lock was grabbed, the buffer flushed, a new buffer prepared, and
the lock dropped.  Another client could come in, set its own state as part of
rendering, and when the first client flushed the rendering commands depending
on the previous state, it got the 2nd client's state.  This is fixed by checking
for enough space before beginning a set of state emits and rendering, and
flushing the buffer first if so.  This guarantees that the buffer won't wrap.

Also, move the "lost_context = 1" from the end of cmdbuf flushing to
UNLOCK_HARDWARE for clarity (at a minimum) that any time the lock is dropped,
state may get overwritten.  We don't have enough information at the point of the
LOCK_HARDWARE to reset our state to the last UNLOCK_HARDWARE point in the case
that we did lose our context, but saving the information to rebuild that state
may be a useful optimization (ipers data suggests up to 5%).
---
 src/mesa/drivers/dri/r200/r200_cmdbuf.c       | 14 ++++----
 src/mesa/drivers/dri/r200/r200_context.h      |  2 ++
 src/mesa/drivers/dri/r200/r200_ioctl.c        |  3 --
 src/mesa/drivers/dri/r200/r200_ioctl.h        | 25 ++++++++++++++
 src/mesa/drivers/dri/r200/r200_lock.h         | 11 ++++++-
 src/mesa/drivers/dri/r200/r200_state_init.c   |  2 ++
 src/mesa/drivers/dri/r200/r200_swtcl.c        |  2 ++
 src/mesa/drivers/dri/r200/r200_tcl.c          |  6 ++++
 src/mesa/drivers/dri/radeon/radeon_context.h  |  2 ++
 src/mesa/drivers/dri/radeon/radeon_ioctl.c    | 27 ++++++---------
 src/mesa/drivers/dri/radeon/radeon_ioctl.h    | 33 +++++++++++++++++++
 src/mesa/drivers/dri/radeon/radeon_lock.h     | 11 ++++++-
 .../drivers/dri/radeon/radeon_state_init.c    |  2 ++
 src/mesa/drivers/dri/radeon/radeon_swtcl.c    |  2 ++
 src/mesa/drivers/dri/radeon/radeon_tcl.c      |  6 ++++
 15 files changed, 118 insertions(+), 30 deletions(-)

diff --git a/src/mesa/drivers/dri/r200/r200_cmdbuf.c b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
index 26812d50b62..fa0c62385b0 100644
--- a/src/mesa/drivers/dri/r200/r200_cmdbuf.c
+++ b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
@@ -183,7 +183,7 @@ extern void r200EmitVbufPrim( r200ContextPtr rmesa,
       fprintf(stderr, "%s cmd_used/4: %d prim %x nr %d\n", __FUNCTION__,
 	      rmesa->store.cmd_used/4, primitive, vertex_nr);
    
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 3 * sizeof(*cmd),
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VBUF_BUFSZ,
 						  __FUNCTION__ );
    cmd[0].i = 0;
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
@@ -236,8 +236,7 @@ GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
    
    r200EmitState( rmesa );
    
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 
-						12 + min_nr*2,
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, ELTS_BUFSZ(min_nr),
 						__FUNCTION__ );
    cmd[0].i = 0;
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
@@ -275,7 +274,7 @@ void r200EmitVertexAOS( r200ContextPtr rmesa,
       fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
 	      __FUNCTION__, vertex_size, offset);
 
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 5 * sizeof(int),
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
 						  __FUNCTION__ );
 
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
@@ -292,18 +291,17 @@ void r200EmitAOS( r200ContextPtr rmesa,
 		    GLuint offset )
 {
    drm_radeon_cmd_header_t *cmd;
-   int sz = 3 + ((nr/2)*3) + ((nr&1)*2);
+   int sz = AOS_BUFSZ(nr);
    int i;
    int *tmp;
 
    if (R200_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "%s nr arrays: %d\n", __FUNCTION__, nr);
 
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sz * sizeof(int),
-						  __FUNCTION__ );
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sz, __FUNCTION__ );
    cmd[0].i = 0;
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | ((sz-3) << 16);
+   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (((sz / sizeof(int)) - 3) << 16);
    cmd[2].i = nr;
    tmp = &cmd[0].i;
    cmd += 3;
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index 8f90c158315..f000e143308 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -528,6 +528,8 @@ struct r200_hw_state {
    struct r200_state_atom grd; /* guard band clipping */
    struct r200_state_atom fog; 
    struct r200_state_atom glt; 
+
+   int max_state_size;	/* Number of bytes necessary for a full state emit. */
 };
 
 struct r200_state {
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c
index 54875d5d22b..5d084baf3e0 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.c
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.c
@@ -132,7 +132,6 @@ int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller )
    rmesa->store.statenr = 0;
    rmesa->store.cmd_used = 0;
    rmesa->dma.nr_released_bufs = 0;
-   rmesa->lost_context = 1;	
    return ret;
 }
 
@@ -564,8 +563,6 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask, GLboolean all,
 	 return;
    }
 
-   r200EmitState( rmesa );
-
    /* Need to cope with lostcontext here as kernel relies on
     * some residual state:
     */
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.h b/src/mesa/drivers/dri/r200/r200_ioctl.h
index 011288161ba..1503df70754 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.h
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.h
@@ -169,6 +169,31 @@ do {							\
    }							\
 } while (0)
 
+/* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
+ * are available, you will also be adding an rmesa->state.max_state_size because
+ * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
+ */
+#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2)) * sizeof(int))
+#define VERT_AOS_BUFSZ	(5 * sizeof(int))
+#define ELTS_BUFSZ(nr)	(12 + nr * 2)
+#define VBUF_BUFSZ	(3 * sizeof(int))
+
+/* Ensure that a minimum amount of space is available in the command buffer.
+ * This is used to ensure atomicity of state updates with the rendering requests
+ * that rely on them.
+ *
+ * An alternative would be to implement a "soft lock" such that when the buffer
+ * wraps at an inopportune time, we grab the lock, flush the current buffer,
+ * and hang on to the lock until the critical section is finished and we flush
+ * the buffer again and unlock.
+ */
+static __inline void r200EnsureCmdBufSpace( r200ContextPtr rmesa, int bytes )
+{
+   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
+      r200FlushCmdBuf( rmesa, __FUNCTION__ );
+   assert( bytes <= R200_CMD_BUF_SZ );
+}
+
 /* Alloc space in the command buffer
  */
 static __inline char *r200AllocCmdBuf( r200ContextPtr rmesa,
diff --git a/src/mesa/drivers/dri/r200/r200_lock.h b/src/mesa/drivers/dri/r200/r200_lock.h
index 7a4dfcf15cc..c913bd5f629 100644
--- a/src/mesa/drivers/dri/r200/r200_lock.h
+++ b/src/mesa/drivers/dri/r200/r200_lock.h
@@ -98,7 +98,15 @@ extern int prevLockLine;
       DEBUG_LOCK();						\
    } while (0)
 
-/* Unlock the hardware.
+/* Unlock the hardware.  We must assume that state has been lost when we unlock,
+ * because when we next grab the lock (to emit an accumulated cmdbuf), we don't
+ * have the information to recreate the context state as of the last unlock in
+ * in the case that we did lose the context state.
+ *
+ * The alternative to this would be to copy out the state on unlock
+ * (approximately) and if we did lose the context, dispatch a cmdbuf to reset
+ * the state to that old copy before continuing with the accumulated command
+ * buffer.
  */
 #define UNLOCK_HARDWARE( rmesa )					\
    do {									\
@@ -106,6 +114,7 @@ extern int prevLockLine;
 		  rmesa->dri.hwLock,					\
 		  rmesa->dri.hwContext );				\
       DEBUG_RESET();							\
+      rmesa->lost_context = GL_TRUE;					\
    } while (0)
 
 #endif
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index ed53c5d2cf0..3b6893aeee9 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -205,6 +205,7 @@ void r200InitState( r200ContextPtr rmesa )
    make_empty_list(&(rmesa->hw.dirty)); rmesa->hw.dirty.name = "DIRTY";
    make_empty_list(&(rmesa->hw.clean)); rmesa->hw.clean.name = "CLEAN";
 
+   rmesa->hw.max_state_size = 0;
 
 #define ALLOC_STATE( ATOM, CHK, SZ, NM, IDX )				\
    do {								\
@@ -215,6 +216,7 @@ void r200InitState( r200ContextPtr rmesa )
       rmesa->hw.ATOM.idx = IDX;					\
       rmesa->hw.ATOM.check = check_##CHK;				\
       insert_at_head(&(rmesa->hw.dirty), &(rmesa->hw.ATOM));	\
+      rmesa->hw.max_state_size += SZ * sizeof(int);		\
    } while (0)
       
       
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
index d694ff1b9bc..66662bb4d03 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
@@ -260,6 +260,8 @@ static void flush_last_swtcl_prim( r200ContextPtr rmesa  )
 	      current->ptr);
 
       if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
+	 r200EnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
+			        rmesa->hw.max_state_size + VBUF_BUFSZ );
 	 r200EmitVertexAOS( rmesa,
 			      rmesa->swtcl.vertex_size,
 			      current_offset);
diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
index 85f4bc1f7dc..b613911b06d 100644
--- a/src/mesa/drivers/dri/r200/r200_tcl.c
+++ b/src/mesa/drivers/dri/r200/r200_tcl.c
@@ -143,6 +143,9 @@ static GLushort *r200AllocElts( r200ContextPtr rmesa, GLuint nr )
    if (rmesa->dma.flush)
       rmesa->dma.flush( rmesa );
 
+   r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+			  rmesa->hw.max_state_size + ELTS_BUFSZ(nr) );
+   
    r200EmitAOS( rmesa,
 		rmesa->tcl.aos_components,
 		rmesa->tcl.nr_aos_components, 0 );
@@ -167,6 +170,9 @@ static void EMIT_PRIM( GLcontext *ctx,
    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    r200TclPrimitive( ctx, prim, hwprim );
    
+   r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+			  rmesa->hw.max_state_size + VBUF_BUFSZ );
+
    r200EmitAOS( rmesa,
 		  rmesa->tcl.aos_components,
 		  rmesa->tcl.nr_aos_components,
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index c8f74272ae0..03392ee6754 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -426,6 +426,8 @@ struct radeon_hw_state {
    struct radeon_state_atom fog; 
    struct radeon_state_atom glt; 
    struct radeon_state_atom txr[2]; /* for NPOT */
+
+   int max_state_size;	/* Number of bytes necessary for a full state emit. */
 };
 
 struct radeon_state {
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.c b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
index c5707a02807..3cb7dca2150 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
@@ -204,9 +204,10 @@ extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
       fprintf(stderr, "%s cmd_used/4: %d\n", __FUNCTION__,
 	      rmesa->store.cmd_used/4);
    
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VBUF_BUFSZ,
+						       __FUNCTION__ );
 #if RADEON_OLD_PACKETS
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 6 * sizeof(*cmd),
-						  __FUNCTION__ );
+   cmd[0].i = 0;
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
    cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM | (3 << 16);
    cmd[2].i = rmesa->ioctl.vertex_offset;
@@ -223,8 +224,6 @@ extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
 	      __FUNCTION__,
 	      cmd[1].i, cmd[2].i, cmd[4].i, cmd[5].i);
 #else
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 4 * sizeof(*cmd),
-						  __FUNCTION__ );
    cmd[0].i = 0;
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
    cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_VBUF | (1 << 16);
@@ -291,10 +290,10 @@ GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
    
    radeonEmitState( rmesa );
    
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa,
+						       ELTS_BUFSZ(min_nr),
+						       __FUNCTION__ );
 #if RADEON_OLD_PACKETS
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 
-						  24 + min_nr*2,
-						  __FUNCTION__ );
    cmd[0].i = 0;
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
    cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM;
@@ -308,9 +307,6 @@ GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
 
    retval = (GLushort *)(cmd+6);
 #else   
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 
-						  16 + min_nr*2,
-						  __FUNCTION__ );
    cmd[0].i = 0;
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
    cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_INDX;
@@ -354,7 +350,7 @@ void radeonEmitVertexAOS( radeonContextPtr rmesa,
       fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
 	      __FUNCTION__, vertex_size, offset);
 
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 5 * sizeof(int),
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
 						  __FUNCTION__ );
 
    cmd[0].i = 0;
@@ -380,7 +376,7 @@ void radeonEmitAOS( radeonContextPtr rmesa,
       (component[0]->aos_start + offset * component[0]->aos_stride * 4);
 #else
    drm_radeon_cmd_header_t *cmd;
-   int sz = 3 + (nr/2 * 3) + (nr & 1) * 2;
+   int sz = AOS_BUFSZ;
    int i;
    int *tmp;
 
@@ -388,11 +384,11 @@ void radeonEmitAOS( radeonContextPtr rmesa,
       fprintf(stderr, "%s\n", __FUNCTION__);
 
 
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sz * sizeof(int),
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sz,
 						  __FUNCTION__ );
    cmd[0].i = 0;
    cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | ((sz-3) << 16);
+   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (((sz / sizeof(int))-3) << 16);
    cmd[2].i = nr;
    tmp = &cmd[0].i;
    cmd += 3;
@@ -548,7 +544,6 @@ static int radeonFlushCmdBufLocked( radeonContextPtr rmesa,
    rmesa->store.statenr = 0;
    rmesa->store.cmd_used = 0;
    rmesa->dma.nr_released_bufs = 0;
-   rmesa->lost_context = 1;	
    return ret;
 }
 
@@ -982,8 +977,6 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask, GLboolean all,
 	       __FUNCTION__, all, cx, cy, cw, ch );
    }
 
-   radeonEmitState( rmesa );
-
    /* Need to cope with lostcontext here as kernel relies on
     * some residual state:
     */
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.h b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
index 3f6e1751cf6..695eb5770be 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.h
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
@@ -164,6 +164,39 @@ do {							\
    }							\
 } while (0)
 
+/* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
+ * are available, you will also be adding an rmesa->state.max_state_size because
+ * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
+ */
+#if RADEON_OLD_PACKETS
+#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2)) * sizeof(int))
+#define VERT_AOS_BUFSZ	(0)
+#define ELTS_BUFSZ(nr)	(24 + nr * 2)
+#define VBUF_BUFSZ	(6 * sizeof(int))
+#else
+#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2)) * sizeof(int))
+#define VERT_AOS_BUFSZ	(5 * sizeof(int))
+#define ELTS_BUFSZ(nr)	(16 + nr * 2)
+#define VBUF_BUFSZ	(4 * sizeof(int))
+#endif
+
+/* Ensure that a minimum amount of space is available in the command buffer.
+ * This is used to ensure atomicity of state updates with the rendering requests
+ * that rely on them.
+ *
+ * An alternative would be to implement a "soft lock" such that when the buffer
+ * wraps at an inopportune time, we grab the lock, flush the current buffer,
+ * and hang on to the lock until the critical section is finished and we flush
+ * the buffer again and unlock.
+ */
+static __inline void radeonEnsureCmdBufSpace( radeonContextPtr rmesa,
+					      int bytes )
+{
+   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
+      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+   assert( bytes <= RADEON_CMD_BUF_SZ );
+}
+
 /* Alloc space in the command buffer
  */
 static __inline char *radeonAllocCmdBuf( radeonContextPtr rmesa,
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.h b/src/mesa/drivers/dri/radeon/radeon_lock.h
index 783db7e92a8..e18a642088a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.h
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.h
@@ -99,7 +99,15 @@ extern int prevLockLine;
       DEBUG_LOCK();						\
    } while (0)
 
-/* Unlock the hardware.
+/* Unlock the hardware.  We must assume that state has been lost when we unlock,
+ * because when we next grab the lock (to emit an accumulated cmdbuf), we don't
+ * have the information to recreate the context state as of the last unlock in
+ * in the case that we did lose the context state.
+ *
+ * The alternative to this would be to copy out the state on unlock
+ * (approximately) and if we did lose the context, dispatch a cmdbuf to reset
+ * the state to that old copy before continuing with the accumulated command
+ * buffer.
  */
 #define UNLOCK_HARDWARE( rmesa )					\
    do {									\
@@ -107,6 +115,7 @@ extern int prevLockLine;
 		  rmesa->dri.hwLock,					\
 		  rmesa->dri.hwContext );				\
       DEBUG_RESET();							\
+      rmesa->lost_context = GL_TRUE;					\
    } while (0)
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
index f842e430e1b..7b2f1470705 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
@@ -202,6 +202,7 @@ void radeonInitState( radeonContextPtr rmesa )
    make_empty_list(&(rmesa->hw.dirty));
    make_empty_list(&(rmesa->hw.clean));
 
+   rmesa->hw.max_state_size = 0;
 
 #define ALLOC_STATE( ATOM, CHK, SZ, NM, FLAG )				\
    do {								\
@@ -212,6 +213,7 @@ void radeonInitState( radeonContextPtr rmesa )
       rmesa->hw.ATOM.is_tcl = FLAG;					\
       rmesa->hw.ATOM.check = check_##CHK;				\
       insert_at_head(&(rmesa->hw.dirty), &(rmesa->hw.ATOM));	\
+      rmesa->hw.max_state_size += SZ * sizeof(int);		\
    } while (0)
       
       
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index ade97daef21..40a61e10add 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -380,6 +380,8 @@ static void flush_last_swtcl_prim( radeonContextPtr rmesa  )
 	      current->ptr);
 
       if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
+	 radeonEnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
+			          rmesa->hw.max_state_size + VBUF_BUFSZ );
 	 radeonEmitVertexAOS( rmesa,
 			      rmesa->swtcl.vertex_size,
 			      current_offset);
diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c
index 9aec22b1a52..6e294d2395c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c
@@ -149,6 +149,9 @@ static GLushort *radeonAllocElts( radeonContextPtr rmesa, GLuint nr )
    if (rmesa->dma.flush)
       rmesa->dma.flush( rmesa );
 
+   radeonEnsureCmdBufSpace(rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+			   rmesa->hw.max_state_size + ELTS_BUFSZ(nr));
+
    radeonEmitAOS( rmesa,
 		rmesa->tcl.aos_components,
 		rmesa->tcl.nr_aos_components, 0 );
@@ -175,6 +178,9 @@ static void EMIT_PRIM( GLcontext *ctx,
    radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
    radeonTclPrimitive( ctx, prim, hwprim );
    
+   radeonEnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+			    rmesa->hw.max_state_size + VBUF_BUFSZ );
+
    radeonEmitAOS( rmesa,
 		  rmesa->tcl.aos_components,
 		  rmesa->tcl.nr_aos_components,
-- 
2.30.2