Convert all uses of CARD32 and CARD8 to uint32_t and uint8_t.
[mesa.git] / src / mesa / drivers / dri / radeon / radeon_swtcl.c
index 926b1523d62c3e0597b200c5c984e6707dc9206f..fd14e76d4441a3218e1fc4daa46ea1aef9e07ac9 100644 (file)
@@ -1,4 +1,4 @@
-/* $XFree86$ */
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_swtcl.c,v 1.6 2003/05/06 23:52:08 daenzer Exp $ */
 /**************************************************************************
 
 Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
@@ -44,8 +44,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "math/m_translate.h"
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
-#include "tnl/t_imm_exec.h"
 #include "tnl/t_pipeline.h"
+#include "tnl/t_vtx_api.h"     /* for _tnl_FlushVertices */
 
 #include "radeon_context.h"
 #include "radeon_ioctl.h"
@@ -67,7 +67,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define RADEON_MAX_SETUP       0x40
 
 static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
-static void flush_last_swtcl_prim_compat( radeonContextPtr rmesa );
 
 static struct {
    void                (*emit)( GLcontext *, GLuint, GLuint, void *, GLuint );
@@ -75,7 +74,6 @@ static struct {
    copy_pv_func                copy_pv;
    GLboolean           (*check_tex_sizes)( GLcontext *ctx );
    GLuint               vertex_size;
-   GLuint               vertex_stride_shift;
    GLuint               vertex_format;
 } setup_tab[RADEON_MAX_SETUP];
 
@@ -135,9 +133,7 @@ static struct {
 #define GET_TEXSOURCE(n)  n
 #define GET_VERTEX_FORMAT() RADEON_CONTEXT(ctx)->swtcl.vertex_format
 #define GET_VERTEX_STORE() RADEON_CONTEXT(ctx)->swtcl.verts
-#define GET_VERTEX_STRIDE_SHIFT() RADEON_CONTEXT(ctx)->swtcl.vertex_stride_shift
-#define GET_UBYTE_COLOR_STORE() &RADEON_CONTEXT(ctx)->UbyteColor
-#define GET_UBYTE_SPEC_COLOR_STORE() &RADEON_CONTEXT(ctx)->UbyteSecondaryColor
+#define GET_VERTEX_SIZE() (RADEON_CONTEXT(ctx)->swtcl.vertex_size * sizeof(GLuint)) /* vertex size in bytes; parenthesized so it composes safely in larger expressions */
 
 #define HAVE_HW_VIEWPORT    1
 /* Tiny vertices don't seem to work atm - haven't looked into why.
@@ -155,10 +151,6 @@ static struct {
 #define CHECK_HW_DIVIDE    (!(ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE| \
                                                     DD_TRI_UNFILLED)))
 
-#define IMPORT_QUALIFIER
-#define IMPORT_FLOAT_COLORS radeon_import_float_colors
-#define IMPORT_FLOAT_SPEC_COLORS radeon_import_float_spec_colors
-
 #define INTERP_VERTEX setup_tab[RADEON_CONTEXT(ctx)->swtcl.SetupIndex].interp
 #define COPY_PV_VERTEX setup_tab[RADEON_CONTEXT(ctx)->swtcl.SetupIndex].copy_pv
 
@@ -272,7 +264,6 @@ static void radeonRenderStart( GLcontext *ctx )
         RADEON_NEWPRIM(rmesa);
         rmesa->swtcl.vertex_format = setup_tab[ind].vertex_format;
         rmesa->swtcl.vertex_size = setup_tab[ind].vertex_size;
-        rmesa->swtcl.vertex_stride_shift = setup_tab[ind].vertex_stride_shift;
       }
 
       if (!(ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
@@ -282,7 +273,6 @@ static void radeonRenderStart( GLcontext *ctx )
    }
    
    if (rmesa->dma.flush != 0 && 
-       rmesa->dma.flush != flush_last_swtcl_prim_compat &&
        rmesa->dma.flush != flush_last_swtcl_prim)
       rmesa->dma.flush( rmesa );
 }
@@ -292,9 +282,8 @@ void radeonBuildVertices( GLcontext *ctx, GLuint start, GLuint count,
                           GLuint newinputs )
 {
    radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
-   GLubyte *v = ((GLubyte *)rmesa->swtcl.verts + 
-                (start << rmesa->swtcl.vertex_stride_shift));
-   GLuint stride = 1 << rmesa->swtcl.vertex_stride_shift;
+   GLuint stride = rmesa->swtcl.vertex_size * sizeof(int);
+   GLubyte *v = ((GLubyte *)rmesa->swtcl.verts + (start * stride));
 
    newinputs |= rmesa->swtcl.SetupNewInputs;
    rmesa->swtcl.SetupNewInputs = 0;
@@ -339,7 +328,6 @@ void radeonChooseVertexState( GLcontext *ctx )
       RADEON_NEWPRIM(rmesa);
       rmesa->swtcl.vertex_format = setup_tab[ind].vertex_format;
       rmesa->swtcl.vertex_size = setup_tab[ind].vertex_size;
-      rmesa->swtcl.vertex_stride_shift = setup_tab[ind].vertex_stride_shift;
    }
 
    {
@@ -381,7 +369,7 @@ static void flush_last_swtcl_prim( radeonContextPtr rmesa  )
 
    if (rmesa->dma.current.buf) {
       struct radeon_dma_region *current = &rmesa->dma.current;
-      GLuint current_offset = (rmesa->radeonScreen->agp_buffer_offset +
+      GLuint current_offset = (rmesa->radeonScreen->gart_buffer_offset +
                               current->buf->buf->idx * RADEON_BUFFER_SIZE + 
                               current->start);
 
@@ -408,46 +396,6 @@ static void flush_last_swtcl_prim( radeonContextPtr rmesa  )
 }
 
 
-static void flush_last_swtcl_prim_compat( radeonContextPtr rmesa )
-{
-   struct radeon_dma_region *current = &rmesa->dma.current;
-
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s buf %p start %d ptr %d\n", 
-             __FUNCTION__,
-             current->buf,
-             current->start,
-             current->ptr);
-
-   assert (!(rmesa->swtcl.hw_primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
-   assert (current->start + 
-          rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-          current->ptr);
-   assert (current->start == 0);
-
-   rmesa->dma.flush = 0;
-
-   if (current->ptr && current->buf) {
-      assert (current->buf->refcount == 1);
-
-      radeonCompatEmitPrimitive( rmesa,
-                                rmesa->swtcl.vertex_format,
-                                rmesa->swtcl.hw_primitive,
-                                rmesa->swtcl.numverts);
-      
-      /* The buffer has been released:
-       */
-      FREE(current->buf);
-      current->buf = 0;
-      current->start = 0;
-      current->ptr = current->end;
-
-   }
-
-   rmesa->swtcl.numverts = 0;
-}
-
-
 /* Alloc space in the current dma region.
  */
 static __inline void *radeonAllocDmaLowVerts( radeonContextPtr rmesa,
@@ -460,22 +408,18 @@ static __inline void *radeonAllocDmaLowVerts( radeonContextPtr rmesa,
 
    if (!rmesa->dma.flush) {
       rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-      if (rmesa->dri.drmMinor == 1)
-        rmesa->dma.flush = flush_last_swtcl_prim_compat;
-      else
-        rmesa->dma.flush = flush_last_swtcl_prim;
+      rmesa->dma.flush = flush_last_swtcl_prim;
    }
 
    assert( vsize == rmesa->swtcl.vertex_size * 4 );
-   assert( rmesa->dma.flush == flush_last_swtcl_prim ||
-          rmesa->dma.flush == flush_last_swtcl_prim_compat);
+   assert( rmesa->dma.flush == flush_last_swtcl_prim );
    assert (rmesa->dma.current.start + 
           rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
           rmesa->dma.current.ptr);
 
 
    {
-      char *head = rmesa->dma.current.address + rmesa->dma.current.ptr;
+      GLubyte *head = (GLubyte *)(rmesa->dma.current.address + rmesa->dma.current.ptr);
       rmesa->dma.current.ptr += bytes;
       rmesa->swtcl.numverts += nverts;
       return head;
@@ -486,13 +430,15 @@ static __inline void *radeonAllocDmaLowVerts( radeonContextPtr rmesa,
 
 
 
-void radeon_emit_contiguous_verts( GLcontext *ctx, GLuint start, GLuint count )
+static void *radeon_emit_contiguous_verts( GLcontext *ctx, 
+                                          GLuint start, 
+                                          GLuint count,
+                                          void *dest)
 {
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint vertex_size = rmesa->swtcl.vertex_size * 4;
-   CARD32 *dest = radeonAllocDmaLowVerts( rmesa, count-start, vertex_size );
-   setup_tab[rmesa->swtcl.SetupIndex].emit( ctx, start, count, dest, 
-                                           vertex_size );
+   GLuint stride = rmesa->swtcl.vertex_size * 4;
+   setup_tab[rmesa->swtcl.SetupIndex].emit( ctx, start, count, dest, stride );
+   return (void *)((char *)dest + stride * (count - start));
 }
 
 
@@ -558,37 +504,13 @@ static __inline void radeonEltPrimitive( radeonContextPtr rmesa, GLenum prim )
 }
 
 
-static void VERT_FALLBACK( GLcontext *ctx,
-                          GLuint start,
-                          GLuint count,
-                          GLuint flags )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   tnl->Driver.Render.PrimitiveNotify( ctx, flags & PRIM_MODE_MASK );
-   tnl->Driver.Render.BuildVertices( ctx, start, count, ~0 );
-   tnl->Driver.Render.PrimTabVerts[flags&PRIM_MODE_MASK]( ctx, start, count, flags );
-   RADEON_CONTEXT(ctx)->swtcl.SetupNewInputs = VERT_BIT_CLIP;
-}
-
-static void ELT_FALLBACK( GLcontext *ctx,
-                         GLuint start,
-                         GLuint count,
-                         GLuint flags )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   tnl->Driver.Render.PrimitiveNotify( ctx, flags & PRIM_MODE_MASK );
-   tnl->Driver.Render.BuildVertices( ctx, start, count, ~0 );
-   tnl->Driver.Render.PrimTabElts[flags&PRIM_MODE_MASK]( ctx, start, count, flags );
-   RADEON_CONTEXT(ctx)->swtcl.SetupNewInputs = VERT_BIT_CLIP;
-}
 
 
 #define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
-#define ELTS_VARS  GLushort *dest
+#define ELTS_VARS( buf )  GLushort *dest = buf
 #define INIT( prim ) radeonDmaPrimitive( rmesa, prim )
 #define ELT_INIT(prim) radeonEltPrimitive( rmesa, prim )
-#define NEW_PRIMITIVE()  RADEON_NEWPRIM( rmesa )
-#define NEW_BUFFER()  radeonRefillCurrentDmaRegion( rmesa )
+#define FLUSH()  RADEON_NEWPRIM( rmesa )
 #define GET_CURRENT_VB_MAX_VERTS() \
   (((int)rmesa->dma.current.end - (int)rmesa->dma.current.ptr) / (rmesa->swtcl.vertex_size*4))
 #define GET_SUBSEQUENT_VB_MAX_VERTS() \
@@ -605,38 +527,35 @@ static void ELT_FALLBACK( GLcontext *ctx,
   ((RADEON_CMD_BUF_SZ - 1024) / 2)
 
 
+static void *radeon_alloc_elts( radeonContextPtr rmesa, int nr )
+{  /* Returns space for nr 16-bit elts (nr*2 bytes), either appended in the command buffer or freshly allocated. */
+   if (rmesa->dma.flush == radeonFlushElts &&
+       rmesa->store.cmd_used + nr*2 < RADEON_CMD_BUF_SZ) {
 
-/* How do you extend an existing primitive?
- */
-#define ALLOC_ELTS(nr)                                                 \
-do {                                                                   \
-   if (rmesa->dma.flush == radeonFlushElts &&                          \
-       rmesa->store.cmd_used + nr*2 < RADEON_CMD_BUF_SZ) {             \
-                                                                       \
-      dest = (GLushort *)(rmesa->store.cmd_buf +                       \
-                         rmesa->store.cmd_used);                       \
-      rmesa->store.cmd_used += nr*2;                                   \
-   }                                                                   \
-   else {                                                              \
-      if (rmesa->dma.flush) {                                          \
-        rmesa->dma.flush( rmesa );                                     \
-      }                                                                        \
-                                                                       \
-      radeonEmitVertexAOS( rmesa,                                      \
-                          rmesa->swtcl.vertex_size,                    \
-                          (rmesa->radeonScreen->agp_buffer_offset +            \
-                           rmesa->swtcl.indexed_verts.buf->buf->idx *  \
-                           RADEON_BUFFER_SIZE +                        \
-                           rmesa->swtcl.indexed_verts.start));         \
-                                                                       \
-      dest = radeonAllocEltsOpenEnded( rmesa,                          \
-                                      rmesa->swtcl.vertex_format,      \
-                                      rmesa->swtcl.hw_primitive,       \
-                                      nr );                            \
-   }                                                                   \
-} while (0)
+      void *dest = rmesa->store.cmd_buf + rmesa->store.cmd_used; /* slot begins at the old cmd_used, as in the macro this replaces */
+      rmesa->store.cmd_used += nr*2;    /* reserve nr*2 bytes only after taking the pointer */
+      return dest;
+   }
+   else {  /* cannot append: run any pending flush, emit the vertex AOS, then allocate a new elt block */
+      if (rmesa->dma.flush) {
+        rmesa->dma.flush( rmesa );
+      }
 
-#define ALLOC_ELTS_NEW_PRIMITIVE(nr) ALLOC_ELTS( nr )
+      radeonEmitVertexAOS( rmesa,
+                          rmesa->swtcl.vertex_size,
+                          (rmesa->radeonScreen->gart_buffer_offset +
+                           rmesa->swtcl.indexed_verts.buf->buf->idx *
+                           RADEON_BUFFER_SIZE +
+                           rmesa->swtcl.indexed_verts.start));
+
+      return (void *) radeonAllocEltsOpenEnded( rmesa,
+                                               rmesa->swtcl.vertex_format,
+                                               rmesa->swtcl.hw_primitive,
+                                               nr );
+   }
+}
+
+#define ALLOC_ELTS(nr) radeon_alloc_elts(rmesa, nr)
 
 #ifdef MESA_BIG_ENDIAN
 /* We could do without (most of) this ugliness if dest was always 32 bit word aligned... */
@@ -649,14 +568,18 @@ do {                                                                      \
 #endif
 #define EMIT_TWO_ELTS(offset, x, y)  *(GLuint *)(dest+offset) = ((y)<<16)|(x);
 #define INCR_ELTS( nr ) dest += nr
+#define ELTPTR dest
 #define RELEASE_ELT_VERTS() \
   radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ )
-#define EMIT_VERTS( ctx, j, nr ) \
-  radeon_emit_contiguous_verts(ctx, j, (j)+(nr))
 #define EMIT_INDEXED_VERTS( ctx, start, count ) \
   radeon_emit_indexed_verts( ctx, start, count )
 
 
+#define ALLOC_VERTS( nr ) \
+  radeonAllocDmaLowVerts( rmesa, nr, rmesa->swtcl.vertex_size * 4 )
+#define EMIT_VERTS( ctx, j, nr, buf ) \
+  radeon_emit_contiguous_verts(ctx, j, (j)+(nr), buf)
+
 #define TAG(x) radeon_dma_##x
 #include "tnl_dd/t_dd_dmatmp.h"
 
@@ -667,52 +590,48 @@ do {                                                                      \
 
 
 static GLboolean radeon_run_render( GLcontext *ctx,
-                                   struct gl_pipeline_stage *stage )
+                                   struct tnl_pipeline_stage *stage )
 {
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    struct vertex_buffer *VB = &tnl->vb;
-   GLuint i, length, flags = 0;
    render_func *tab = TAG(render_tab_verts);
+   GLuint i;
 
    if (rmesa->swtcl.indexed_verts.buf && (!VB->Elts || stage->changed_inputs)) 
       RELEASE_ELT_VERTS();
        
-   if (VB->ClipOrMask ||            /* No clipping */
-       rmesa->swtcl.RenderIndex != 0 ||    /* No per-vertex manipulations */
-       ctx->Line.StippleFlag)        /* GH: THIS IS A HACK!!! */
+   if (rmesa->swtcl.RenderIndex != 0 ||   
+       !radeon_dma_validate_render( ctx, VB ))
       return GL_TRUE;          
 
-   if (rmesa->dri.drmMinor < 3) {
-      /* drm 1.1 doesn't support vertex primitives starting in the
-       * middle of a buffer.  It doesn't support sane indexed vertices
-       * either.  drm 1.2 fixes both of these problems, but we don't have a
-       * compatibility layer to that version yet.  
-       */
-      return GL_TRUE;
-   }
-
    tnl->Driver.Render.Start( ctx );
 
    if (VB->Elts) {
       tab = TAG(render_tab_elts);
-      if (!rmesa->swtcl.indexed_verts.buf)
-        if (!TAG(emit_elt_verts)(ctx, 0, VB->Count))
-           return GL_TRUE;     /* too many vertices */
+      if (!rmesa->swtcl.indexed_verts.buf) {
+        if (VB->Count > GET_SUBSEQUENT_VB_MAX_VERTS())
+           return GL_TRUE;
+        EMIT_INDEXED_VERTS(ctx, 0, VB->Count);
+      }
    }
 
-   for (i = 0 ; !(flags & PRIM_LAST) ; i += length)
+   for (i = 0 ; i < VB->PrimitiveCount ; i++)
    {
-      flags = VB->Primitive[i];
-      length = VB->PrimitiveLength[i];
+      GLuint prim = VB->Primitive[i].mode;
+      GLuint start = VB->Primitive[i].start;
+      GLuint length = VB->Primitive[i].count;
+
+      if (!length)
+        continue;
 
       if (RADEON_DEBUG & DEBUG_PRIMS)
         fprintf(stderr, "radeon_render.c: prim %s %d..%d\n", 
-                _mesa_lookup_enum_by_nr(flags & PRIM_MODE_MASK), 
-                i, i+length);
+                _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
+                start, start+length);
 
       if (length)
-        tab[flags & PRIM_MODE_MASK]( ctx, i, i + length, flags );
+        tab[prim & PRIM_MODE_MASK]( ctx, start, start + length, prim );
    }
 
    tnl->Driver.Render.Finish( ctx );
@@ -723,35 +642,19 @@ static GLboolean radeon_run_render( GLcontext *ctx,
 
 
 static void radeon_check_render( GLcontext *ctx,
-                                struct gl_pipeline_stage *stage )
+                                struct tnl_pipeline_stage *stage )
 {
-   GLuint inputs = VERT_BIT_POS | VERT_BIT_CLIP | VERT_BIT_COLOR0;
-
-   if (ctx->RenderMode == GL_RENDER) {
-      if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR)
-        inputs |= VERT_BIT_COLOR1;
-
-      if (ctx->Texture.Unit[0]._ReallyEnabled)
-        inputs |= VERT_BIT_TEX0;
-
-      if (ctx->Texture.Unit[1]._ReallyEnabled)
-        inputs |= VERT_BIT_TEX1;
-
-      if (ctx->Fog.Enabled)
-        inputs |= VERT_BIT_FOG;
-   }
-
-   stage->inputs = inputs;
+   stage->inputs = TNL_CONTEXT(ctx)->render_inputs;
 }
 
 
-static void dtr( struct gl_pipeline_stage *stage )
+static void dtr( struct tnl_pipeline_stage *stage )
 {
    (void)stage;
 }
 
 
-const struct gl_pipeline_stage _radeon_render_stage =
+const struct tnl_pipeline_stage _radeon_render_stage =
 {
    "radeon render",
    (_DD_NEW_SEPARATE_SPECULAR |
@@ -784,7 +687,7 @@ struct texrect_stage_data {
 
 
 static GLboolean run_texrect_stage( GLcontext *ctx,
-                                   struct gl_pipeline_stage *stage )
+                                   struct tnl_pipeline_stage *stage )
 {
    struct texrect_stage_data *store = TEXRECT_STAGE_DATA(stage);
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
@@ -801,7 +704,7 @@ static GLboolean run_texrect_stage( GLcontext *ctx,
    
       if (stage->changed_inputs & VERT_BIT_TEX(i)) {
         struct gl_texture_object *texObj = ctx->Texture.Unit[i].CurrentRect;
-        struct gl_texture_image *texImage = texObj->Image[texObj->BaseLevel];
+        struct gl_texture_image *texImage = texObj->Image[0][texObj->BaseLevel];
         const GLfloat iw = 1.0/texImage->Width;
         const GLfloat ih = 1.0/texImage->Height;
         GLfloat *in = (GLfloat *)VB->TexCoordPtr[i]->data;
@@ -826,7 +729,7 @@ static GLboolean run_texrect_stage( GLcontext *ctx,
 /* Called the first time stage->run() is invoked.
  */
 static GLboolean alloc_texrect_data( GLcontext *ctx,
-                                    struct gl_pipeline_stage *stage )
+                                    struct tnl_pipeline_stage *stage )
 {
    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
    struct texrect_stage_data *store;
@@ -848,7 +751,7 @@ static GLboolean alloc_texrect_data( GLcontext *ctx,
 
 
 static void check_texrect( GLcontext *ctx,
-                          struct gl_pipeline_stage *stage )
+                          struct tnl_pipeline_stage *stage )
 {
    GLuint flags = 0;
 
@@ -864,7 +767,7 @@ static void check_texrect( GLcontext *ctx,
 }
 
 
-static void free_texrect_data( struct gl_pipeline_stage *stage )
+static void free_texrect_data( struct tnl_pipeline_stage *stage )
 {
    struct texrect_stage_data *store = TEXRECT_STAGE_DATA(stage);
    GLuint i;
@@ -879,7 +782,7 @@ static void free_texrect_data( struct gl_pipeline_stage *stage )
 }
 
 
-const struct gl_pipeline_stage _radeon_texrect_stage =
+const struct tnl_pipeline_stage _radeon_texrect_stage =
 {
    "radeon texrect stage",                     /* name */
    _NEW_TEXTURE,       /* check_state */
@@ -921,6 +824,7 @@ static void radeonResetLineStipple( GLcontext *ctx );
  ***********************************************************************/
 
 #undef LOCAL_VARS
+#undef ALLOC_VERTS
 #define CTX_ARG radeonContextPtr rmesa
 #define CTX_ARG2 rmesa
 #define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
@@ -928,9 +832,8 @@ static void radeonResetLineStipple( GLcontext *ctx );
 #undef LOCAL_VARS
 #define LOCAL_VARS                                             \
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);               \
-   const GLuint shift = rmesa->swtcl.vertex_stride_shift;      \
    const char *radeonverts = (char *)rmesa->swtcl.verts;
-#define VERT(x) (radeonVertex *)(radeonverts + (x << shift))
+#define VERT(x) (radeonVertex *)(radeonverts + (x * vertsize * sizeof(int)))
 #define VERTEX radeonVertex 
 #undef TAG
 #define TAG(x) radeon_##x
@@ -952,7 +855,6 @@ static void radeonResetLineStipple( GLcontext *ctx );
 
 #define RADEON_TWOSIDE_BIT     0x01
 #define RADEON_UNFILLED_BIT    0x02
-#define RADEON_OFFSET_BIT      0x04 /* drmMinor == 1 */
 #define RADEON_MAX_TRIFUNC     0x08
 
 
@@ -965,7 +867,7 @@ static struct {
 
 
 #define DO_FALLBACK  0
-#define DO_OFFSET   (IND & RADEON_OFFSET_BIT)
+#define DO_OFFSET    0
 #define DO_UNFILLED (IND & RADEON_UNFILLED_BIT)
 #define DO_TWOSIDE  (IND & RADEON_TWOSIDE_BIT)
 #define DO_FLAT      0
@@ -989,23 +891,43 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e<<rmesa->swtcl.vertex_stride_shift))
+#define GET_VERTEX(e) (rmesa->swtcl.verts + ((e) * rmesa->swtcl.vertex_size * sizeof(int))) /* (e) parenthesized so expression arguments scale correctly */
+
+#define VERT_SET_RGBA( v, c )                                          \
+do {                                                           \
+   radeon_color_t *color = (radeon_color_t *)&((v)->ui[coloroffset]);  \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]);               \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]);             \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]);              \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]);             \
+} while (0)
 
-#define VERT_SET_RGBA( v, c )    v->ui[coloroffset] = LE32_TO_CPU(*(GLuint *)c)
 #define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
-#define VERT_SAVE_RGBA( idx )    color[idx] = CPU_TO_LE32(v[idx]->ui[coloroffset])
-#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = LE32_TO_CPU(color[idx])
-
-#define VERT_SET_SPEC( v0, c )   if (havespec) {                       \
-                                       v0->v.specular.red   = (c)[0];  \
-                                       v0->v.specular.green = (c)[1];  \
-                                       v0->v.specular.blue  = (c)[2]; }
-#define VERT_COPY_SPEC( v0, v1 ) if (havespec) {                                       \
-                                       v0->v.specular.red   = v1->v.specular.red;      \
-                                       v0->v.specular.green = v1->v.specular.green;    \
-                                       v0->v.specular.blue  = v1->v.specular.blue; }
-#define VERT_SAVE_SPEC( idx )    if (havespec) spec[idx] = CPU_TO_LE32(v[idx]->ui[5])
-#define VERT_RESTORE_SPEC( idx ) if (havespec) v[idx]->ui[5] = LE32_TO_CPU(spec[idx])
+
+#define VERT_SET_SPEC( v0, c )                                 \
+do {                                                           \
+   if (havespec) {                                             \
+      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.red, (c)[0]);    \
+      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.green, (c)[1]);  \
+      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.blue, (c)[2]);   \
+   }                                                           \
+} while (0)
+#define VERT_COPY_SPEC( v0, v1 )                       \
+do {                                                   \
+   if (havespec) {                                     \
+      v0->v.specular.red   = v1->v.specular.red;       \
+      v0->v.specular.green = v1->v.specular.green;     \
+      v0->v.specular.blue  = v1->v.specular.blue;      \
+   }                                                   \
+} while (0)
+
+/* These don't need LE32_TO_CPU() as they used to save and restore
+ * colors which are already in the correct format.
+ */
+#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
+#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_SAVE_SPEC( idx )    if (havespec) spec[idx] = v[idx]->ui[5]
+#define VERT_RESTORE_SPEC( idx ) if (havespec) v[idx]->ui[5] = spec[idx]
 
 #undef LOCAL_VARS
 #undef TAG
@@ -1051,22 +973,6 @@ static struct {
 #define TAG(x) x##_twoside_unfilled
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (RADEON_OFFSET_BIT)
-#define TAG(x) x##_offset
-#include "tnl_dd/t_dd_tritmp.h"
-
-#define IND (RADEON_TWOSIDE_BIT|RADEON_OFFSET_BIT)
-#define TAG(x) x##_twoside_offset
-#include "tnl_dd/t_dd_tritmp.h"
-
-#define IND (RADEON_UNFILLED_BIT|RADEON_OFFSET_BIT)
-#define TAG(x) x##_unfilled_offset
-#include "tnl_dd/t_dd_tritmp.h"
-
-#define IND (RADEON_TWOSIDE_BIT|RADEON_UNFILLED_BIT|RADEON_OFFSET_BIT)
-#define TAG(x) x##_twoside_unfilled_offset
-#include "tnl_dd/t_dd_tritmp.h"
-
 
 static void init_rast_tab( void )
 {
@@ -1074,17 +980,13 @@ static void init_rast_tab( void )
    init_twoside();
    init_unfilled();
    init_twoside_unfilled();
-   init_offset();
-   init_twoside_offset();
-   init_unfilled_offset();
-   init_twoside_unfilled_offset();
 }
 
 /**********************************************************************/
 /*               Render unclipped begin/end objects                   */
 /**********************************************************************/
 
-#define VERT(x) (radeonVertex *)(radeonverts + (x << shift))
+#define VERT(x) (radeonVertex *)(radeonverts + (x * vertsize * sizeof(int)))
 #define RENDER_POINTS( start, count )          \
    for ( ; start < count ; start++)            \
       radeon_point( rmesa, VERT(start) )
@@ -1101,7 +1003,7 @@ static void init_rast_tab( void )
 #undef LOCAL_VARS
 #define LOCAL_VARS                                             \
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);               \
-   const GLuint shift = rmesa->swtcl.vertex_stride_shift;              \
+   const GLuint vertsize = rmesa->swtcl.vertex_size;           \
    const char *radeonverts = (char *)rmesa->swtcl.verts;               \
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;       \
    const GLboolean stipple = ctx->Line.StippleFlag;            \
@@ -1136,8 +1038,6 @@ void radeonChooseRenderState( GLcontext *ctx )
 
    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= RADEON_TWOSIDE_BIT;
    if (flags & DD_TRI_UNFILLED)      index |= RADEON_UNFILLED_BIT;
-   if ((flags & DD_TRI_OFFSET) &&
-       rmesa->dri.drmMinor == 1)  index |= RADEON_OFFSET_BIT;
 
    if (index != rmesa->swtcl.RenderIndex) {
       tnl->Driver.Render.Points = rast_tab[index].points;
@@ -1271,7 +1171,7 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 void radeonFlushVertices( GLcontext *ctx, GLuint flags )
 {
-   _tnl_flush_vertices( ctx, flags );
+   _tnl_FlushVertices( ctx, flags );
 
    if (flags & FLUSH_STORED_VERTICES)
       RADEON_NEWPRIM( RADEON_CONTEXT( ctx ) );
@@ -1300,7 +1200,7 @@ void radeonInitSwtcl( GLcontext *ctx )
    tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
    tnl->Driver.Render.BuildVertices = radeonBuildVertices;
 
-   rmesa->swtcl.verts = ALIGN_MALLOC( size * 16 * 4, 32 );
+   rmesa->swtcl.verts = (GLubyte *)ALIGN_MALLOC( size * 16 * 4, 32 );
    rmesa->swtcl.RenderIndex = ~0;
    rmesa->swtcl.render_primitive = GL_TRIANGLES;
    rmesa->swtcl.hw_primitive = 0;
@@ -1320,13 +1220,4 @@ void radeonDestroySwtcl( GLcontext *ctx )
       rmesa->swtcl.verts = 0;
    }
 
-   if (rmesa->UbyteSecondaryColor.Ptr) {
-      ALIGN_FREE(rmesa->UbyteSecondaryColor.Ptr);
-      rmesa->UbyteSecondaryColor.Ptr = 0;
-   }
-
-   if (rmesa->UbyteColor.Ptr) {
-      ALIGN_FREE(rmesa->UbyteColor.Ptr);
-      rmesa->UbyteColor.Ptr = 0;
-   }
 }