Bugzilla #2195: Convert the radeon driver to the t_vertex interface. This cuts
author: Eric Anholt <anholt@FreeBSD.org>
Tue, 31 May 2005 04:04:24 +0000 (04:04 +0000)
committer: Eric Anholt <anholt@FreeBSD.org>
Tue, 31 May 2005 04:04:24 +0000 (04:04 +0000)
about 200 lines from the code and 25k from the binary, while matching other
drivers more closely.  In the worst case (tcl_mode=0) it appears to have
a performance cost of 4.4% +/- 0.3% on quake3 (800x600 demofours, 1ghz p3,
rv200).  Tested on ut2004, ut, q3, projtex.

Submitted by: Andreas Stenglein <a.stenglein@gmx.net>

src/mesa/drivers/dri/radeon/radeon_context.c
src/mesa/drivers/dri/radeon/radeon_context.h
src/mesa/drivers/dri/radeon/radeon_swtcl.c

index 60eecc741c865c45c4fbbb404a92a499011cfcc7..98177bbb4a933b8b30a8760eaf0bae3febc766f0 100644 (file)
@@ -63,7 +63,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_vtxfmt.h"
 #include "radeon_maos.h"
 
-#define DRIVER_DATE    "20041207"
+#define DRIVER_DATE    "20050528"
 
 #include "vblank.h"
 #include "utils.h"
index 8d0637ca326f0beba11ad0640451f19fe556f588..3019602b7cff9c2ba87265bebbf71bde997b0686 100644 (file)
@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __RADEON_CONTEXT_H__
 #define __RADEON_CONTEXT_H__
 
+#include "tnl/t_vertex.h"
 #include "dri_util.h"
 #include "drm.h"
 #include "radeon_drm.h"
@@ -530,12 +531,13 @@ struct radeon_tcl_info {
 /* radeon_swtcl.c
  */
 struct radeon_swtcl_info {
-   GLuint SetupIndex;
-   GLuint SetupNewInputs;
    GLuint RenderIndex;
    GLuint vertex_size;
-   GLuint vertex_stride_shift;
    GLuint vertex_format;
+
+   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
+   GLuint vertex_attr_count;
+
    GLubyte *verts;
 
    /* Fallback rasterization functions
@@ -548,6 +550,18 @@ struct radeon_swtcl_info {
    GLenum render_primitive;
    GLuint numverts;
 
+   /**
+    * Offset of the 4UB color data within a hardware (swtcl) vertex.
+    */
+   GLuint coloroffset;
+
+   /**
+    * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
+    */
+   GLuint specoffset;
+
+   GLboolean needproj;
+
    struct radeon_dma_region indexed_verts;
 };
 
@@ -707,6 +721,7 @@ struct radeon_context {
    GLuint TclFallback;
    GLuint Fallback;
    GLuint NewGLState;
+   GLuint tnl_index;   /* index of bits for last tnl_install_attrs */
 
    /* Vertex buffers
     */
index 4f2198ac5b56c2b0d4af1a5dbe8bf940ee47023b..57c39714d49729511dba07a6bd0daf615b9b6b47 100644 (file)
@@ -53,224 +53,175 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_swtcl.h"
 #include "radeon_tcl.h"
 
-/***********************************************************************
- *              Build render functions from dd templates               *
- ***********************************************************************/
-
-
-#define RADEON_XYZW_BIT                0x01
-#define RADEON_RGBA_BIT                0x02
-#define RADEON_SPEC_BIT                0x04
-#define RADEON_TEX0_BIT                0x08
-#define RADEON_TEX1_BIT                0x10
-#define RADEON_PTEX_BIT                0x20
-#define RADEON_MAX_SETUP       0x40
 
 static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
 
-static struct {
-   void                (*emit)( GLcontext *, GLuint, GLuint, void *, GLuint );
-   tnl_interp_func             interp;
-   tnl_copy_pv_func            copy_pv;
-   GLboolean           (*check_tex_sizes)( GLcontext *ctx );
-   GLuint               vertex_size;
-   GLuint               vertex_format;
-} setup_tab[RADEON_MAX_SETUP];
-
-
-#define TINY_VERTEX_FORMAT             (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_PKCOLOR)
-
-#define NOTEX_VERTEX_FORMAT            (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_W0 |         \
-                                        RADEON_CP_VC_FRMT_PKCOLOR |    \
-                                        RADEON_CP_VC_FRMT_PKSPEC)
-
-#define TEX0_VERTEX_FORMAT             (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_W0 |         \
-                                        RADEON_CP_VC_FRMT_PKCOLOR |    \
-                                        RADEON_CP_VC_FRMT_PKSPEC |     \
-                                        RADEON_CP_VC_FRMT_ST0)
-
-#define TEX1_VERTEX_FORMAT             (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_W0 |         \
-                                        RADEON_CP_VC_FRMT_PKCOLOR |    \
-                                        RADEON_CP_VC_FRMT_PKSPEC |     \
-                                        RADEON_CP_VC_FRMT_ST0 |        \
-                                        RADEON_CP_VC_FRMT_ST1)
-
-#define PROJ_TEX1_VERTEX_FORMAT                (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_W0 |         \
-                                        RADEON_CP_VC_FRMT_PKCOLOR |    \
-                                        RADEON_CP_VC_FRMT_PKSPEC |     \
-                                        RADEON_CP_VC_FRMT_ST0 |        \
-                                        RADEON_CP_VC_FRMT_Q0 |         \
-                                        RADEON_CP_VC_FRMT_ST1 |        \
-                                        RADEON_CP_VC_FRMT_Q1)
-
-#define TEX2_VERTEX_FORMAT 0
-#define TEX3_VERTEX_FORMAT 0
-#define PROJ_TEX3_VERTEX_FORMAT 0
-
-#define DO_XYZW (IND & RADEON_XYZW_BIT)
-#define DO_RGBA (IND & RADEON_RGBA_BIT)
-#define DO_SPEC (IND & RADEON_SPEC_BIT)
-#define DO_FOG  (IND & RADEON_SPEC_BIT)
-#define DO_TEX0 (IND & RADEON_TEX0_BIT)
-#define DO_TEX1 (IND & RADEON_TEX1_BIT)
-#define DO_TEX2 0
-#define DO_TEX3 0
-#define DO_PTEX (IND & RADEON_PTEX_BIT)
-
-#define VERTEX radeonVertex
-#define VERTEX_COLOR radeon_color_t
-#define GET_VIEWPORT_MAT() 0
-#define GET_TEXSOURCE(n)  n
-#define GET_VERTEX_FORMAT() RADEON_CONTEXT(ctx)->swtcl.vertex_format
-#define GET_VERTEX_STORE() RADEON_CONTEXT(ctx)->swtcl.verts
-#define GET_VERTEX_SIZE() RADEON_CONTEXT(ctx)->swtcl.vertex_size * sizeof(GLuint)
-
-#define HAVE_HW_VIEWPORT    1
-/* Tiny vertices don't seem to work atm - haven't looked into why.
- */
-#define HAVE_HW_DIVIDE      (IND & ~(RADEON_XYZW_BIT|RADEON_RGBA_BIT))
-#define HAVE_TINY_VERTICES  1
-#define HAVE_RGBA_COLOR     1
-#define HAVE_NOTEX_VERTICES 1
-#define HAVE_TEX0_VERTICES  1
-#define HAVE_TEX1_VERTICES  1
-#define HAVE_TEX2_VERTICES  0
-#define HAVE_TEX3_VERTICES  0
-#define HAVE_PTEX_VERTICES  1
-
-#define CHECK_HW_DIVIDE    (!(ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE| \
-                                                    DD_TRI_UNFILLED)))
-
-#define INTERP_VERTEX setup_tab[RADEON_CONTEXT(ctx)->swtcl.SetupIndex].interp
-#define COPY_PV_VERTEX setup_tab[RADEON_CONTEXT(ctx)->swtcl.SetupIndex].copy_pv
-
+/* R100: xyzw, c0, c1/fog, stq[0..2]  = 4+1+1+3*3 = 15  right? */
+/* R200: xyzw, c0, c1/fog, strq[0..5] = 4+1+1+4*6 = 30 */
+#define RADEON_MAX_TNL_VERTEX_SIZE (15 * sizeof(GLfloat))      /* for mesa _tnl stage */
 
 /***********************************************************************
- *         Generate  pv-copying and translation functions              *
- ***********************************************************************/
-
-#define TAG(x) radeon_##x
-#define IND ~0
-#include "tnl_dd/t_dd_vb.c"
-#undef IND
-
-
-/***********************************************************************
- *             Generate vertex emit and interp functions               *
+ *                         Initialization 
  ***********************************************************************/
 
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT)
-#define TAG(x) x##_wg
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_TEX0_BIT)
-#define TAG(x) x##_wgt0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_TEX0_BIT|RADEON_PTEX_BIT)
-#define TAG(x) x##_wgpt0
-#include "tnl_dd/t_dd_vbtmp.h"
+#define EMIT_ATTR( ATTR, STYLE, F0 )                                   \
+do {                                                                   \
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);  \
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE); \
+   rmesa->swtcl.vertex_attr_count++;                                   \
+   fmt_0 |= F0;                                                                \
+} while (0)
 
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_TEX0_BIT|RADEON_TEX1_BIT)
-#define TAG(x) x##_wgt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
+#define EMIT_PAD( N )                                                  \
+do {                                                                   \
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;               \
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;        \
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);             \
+   rmesa->swtcl.vertex_attr_count++;                                   \
+} while (0)
 
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_TEX0_BIT|RADEON_TEX1_BIT|\
-             RADEON_PTEX_BIT)
-#define TAG(x) x##_wgpt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
+static GLuint radeon_cp_vc_frmts[3][2] =
+{
+   { RADEON_CP_VC_FRMT_ST0, RADEON_CP_VC_FRMT_ST0 | RADEON_CP_VC_FRMT_Q0 },
+   { RADEON_CP_VC_FRMT_ST1, RADEON_CP_VC_FRMT_ST1 | RADEON_CP_VC_FRMT_Q1 },
+   { RADEON_CP_VC_FRMT_ST2, RADEON_CP_VC_FRMT_ST2 | RADEON_CP_VC_FRMT_Q2 },
+};
 
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT)
-#define TAG(x) x##_wgfs
-#include "tnl_dd/t_dd_vbtmp.h"
+static void radeonSetVertexFormat( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   GLuint index = tnl->render_inputs;
+   int fmt_0 = 0;
+   int offset = 0;
 
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT|\
-            RADEON_TEX0_BIT)
-#define TAG(x) x##_wgfst0
-#include "tnl_dd/t_dd_vbtmp.h"
 
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT|\
-            RADEON_TEX0_BIT|RADEON_PTEX_BIT)
-#define TAG(x) x##_wgfspt0
-#include "tnl_dd/t_dd_vbtmp.h"
+   /* Important:
+    */
+   if ( VB->NdcPtr != NULL ) {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
+   }
+   else {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+   }
 
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT|\
-            RADEON_TEX0_BIT|RADEON_TEX1_BIT)
-#define TAG(x) x##_wgfst0t1
-#include "tnl_dd/t_dd_vbtmp.h"
+   assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+   rmesa->swtcl.vertex_attr_count = 0;
+
+   /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
+    * build up a hardware vertex.
+    */
+   if ( !rmesa->swtcl.needproj ||
+        (index & _TNL_BITS_TEX_ANY)) { /* for projtex */
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F, 
+                RADEON_CP_VC_FRMT_XY | RADEON_CP_VC_FRMT_Z | RADEON_CP_VC_FRMT_W0 );
+      offset = 4;
+   }
+   else {
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_3F, 
+                RADEON_CP_VC_FRMT_XY | RADEON_CP_VC_FRMT_Z );
+      offset = 3;
+   }
 
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT|\
-            RADEON_TEX0_BIT|RADEON_TEX1_BIT|RADEON_PTEX_BIT)
-#define TAG(x) x##_wgfspt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
+   rmesa->swtcl.coloroffset = offset;
+#if MESA_LITTLE_ENDIAN 
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, 
+             RADEON_CP_VC_FRMT_PKCOLOR );
+#else
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR,
+             RADEON_CP_VC_FRMT_PKCOLOR );
+#endif
+   offset += 1;
 
+   rmesa->swtcl.specoffset = 0;
+   if (index & (_TNL_BIT_COLOR1|_TNL_BIT_FOG)) {
 
-/***********************************************************************
- *                         Initialization 
- ***********************************************************************/
+#if MESA_LITTLE_ENDIAN 
+      if (index & _TNL_BIT_COLOR1) {
+        rmesa->swtcl.specoffset = offset;
+        EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB,
+                   RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+        EMIT_PAD( 3 );
+      }
 
-static void init_setup_tab( void )
-{
-   init_wg();
-   init_wgt0();
-   init_wgpt0();
-   init_wgt0t1();
-   init_wgpt0t1();
-   init_wgfs();
-   init_wgfst0();
-   init_wgfspt0();
-   init_wgfst0t1();
-   init_wgfspt0t1();
-}
+      if (index & _TNL_BIT_FOG) {
+        EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F,
+                   RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+        EMIT_PAD( 1 );
+      }
+#else
+      if (index & _TNL_BIT_FOG) {
+        EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F,
+                   RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+        EMIT_PAD( 1 );
+      }
 
+      if (index & _TNL_BIT_COLOR1) {
+        rmesa->swtcl.specoffset = offset;
+        EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_BGR,
+                   RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+        EMIT_PAD( 3 );
+      }
+#endif
+   }
 
+   if (index & _TNL_BITS_TEX_ANY) {
+      int i;
+
+      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+        if (index & _TNL_BIT_TEX(i)) {
+           GLuint sz = VB->TexCoordPtr[i]->size;
+
+           switch (sz) {
+           case 1:
+           case 2:
+           case 3:
+              EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_2F,
+                         radeon_cp_vc_frmts[i][0] );
+              break;
+           case 4:
+              EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_3F_XYW,
+                         radeon_cp_vc_frmts[i][1] );
+              break;
+           default:
+              continue;
+           };
+        }
+      }
+   }
 
-void radeonPrintSetupFlags(char *msg, GLuint flags )
-{
-   fprintf(stderr, "%s(%x): %s%s%s%s%s%s\n",
-          msg,
-          (int)flags,
-          (flags & RADEON_XYZW_BIT)      ? " xyzw," : "",
-          (flags & RADEON_RGBA_BIT)     ? " rgba," : "",
-          (flags & RADEON_SPEC_BIT)     ? " spec/fog," : "",
-          (flags & RADEON_TEX0_BIT)     ? " tex-0," : "",
-          (flags & RADEON_TEX1_BIT)     ? " tex-1," : "",
-          (flags & RADEON_PTEX_BIT)     ? " proj-tex," : "");
+   if ( rmesa->tnl_index != index ||
+       fmt_0 != rmesa->swtcl.vertex_format) {
+      RADEON_NEWPRIM(rmesa);
+      rmesa->swtcl.vertex_format = fmt_0;
+      rmesa->swtcl.vertex_size =
+         _tnl_install_attrs( ctx,
+                             rmesa->swtcl.vertex_attrs, 
+                             rmesa->swtcl.vertex_attr_count,
+                             NULL, 0 );
+      rmesa->swtcl.vertex_size /= 4;
+      rmesa->tnl_index = index;
+      if (RADEON_DEBUG & DEBUG_VERTS)
+        fprintf( stderr, "%s: vertex_size= %d floats\n",
+                 __FUNCTION__, rmesa->swtcl.vertex_size);
+   }
 }
 
 
 static void radeonRenderStart( GLcontext *ctx )
 {
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
    radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
 
-   if (!setup_tab[rmesa->swtcl.SetupIndex].check_tex_sizes(ctx)) {
-      GLuint ind = rmesa->swtcl.SetupIndex |= (RADEON_PTEX_BIT|RADEON_RGBA_BIT);
-
-      /* Projective textures are handled nicely; just have to change
-       * up to the new vertex format.
-       */
-      if (setup_tab[ind].vertex_format != rmesa->swtcl.vertex_format) {
-        RADEON_NEWPRIM(rmesa);
-        rmesa->swtcl.vertex_format = setup_tab[ind].vertex_format;
-        rmesa->swtcl.vertex_size = setup_tab[ind].vertex_size;
-      }
-
-      if (!(ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
-        tnl->Driver.Render.Interp = setup_tab[rmesa->swtcl.SetupIndex].interp;
-        tnl->Driver.Render.CopyPV = setup_tab[rmesa->swtcl.SetupIndex].copy_pv;
-      }
-   }
+   radeonSetVertexFormat( ctx );
    
    if (rmesa->dma.flush != 0 && 
        rmesa->dma.flush != flush_last_swtcl_prim)
@@ -278,82 +229,40 @@ static void radeonRenderStart( GLcontext *ctx )
 }
 
 
-void radeonBuildVertices( GLcontext *ctx, GLuint start, GLuint count,
-                          GLuint newinputs )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
-   GLuint stride = rmesa->swtcl.vertex_size * sizeof(int);
-   GLubyte *v = ((GLubyte *)rmesa->swtcl.verts + (start * stride));
-
-   newinputs |= rmesa->swtcl.SetupNewInputs;
-   rmesa->swtcl.SetupNewInputs = 0;
-
-   if (!newinputs)
-      return;
-
-   setup_tab[rmesa->swtcl.SetupIndex].emit( ctx, start, count, v, stride );
-}
-
+/**
+ * Set vertex state for SW TCL.  The primary purpose of this function is to
+ * determine in advance whether or not the hardware can / should do the
+ * projection divide or Mesa should do it.
+ */
 void radeonChooseVertexState( GLcontext *ctx )
 {
    radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint ind = (RADEON_XYZW_BIT | RADEON_RGBA_BIT);
-
-   if (!rmesa->TclFallback || rmesa->Fallback)
-      return;
-
-   if (ctx->Fog.Enabled || (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR))
-      ind |= RADEON_SPEC_BIT;
 
-   if (ctx->Texture._EnabledUnits & 0x2)
-      /* unit 1 enabled */
-      ind |= RADEON_TEX0_BIT|RADEON_TEX1_BIT;
-   else if (ctx->Texture._EnabledUnits & 0x1)
-      /* unit 0 enabled */
-      ind |= RADEON_TEX0_BIT;
+   GLuint se_coord_fmt;
 
-   rmesa->swtcl.SetupIndex = ind;
+   /* HW perspective divide is a win, but tiny vertex formats are a
+    * bigger one.
+    */
 
-   if (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED)) {
-      tnl->Driver.Render.Interp = radeon_interp_extras;
-      tnl->Driver.Render.CopyPV = radeon_copy_pv_extras;
+   if ( ((tnl->render_inputs & (_TNL_BITS_TEX_ANY|_TNL_BIT_COLOR1) ) == 0)
+       || (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
+      rmesa->swtcl.needproj = GL_TRUE;
+      se_coord_fmt = (RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
+                     RADEON_VTX_Z_PRE_MULT_1_OVER_W0 |
+                     RADEON_TEX1_W_ROUTING_USE_Q1);
    }
    else {
-      tnl->Driver.Render.Interp = setup_tab[ind].interp;
-      tnl->Driver.Render.CopyPV = setup_tab[ind].copy_pv;
+      rmesa->swtcl.needproj = GL_FALSE;
+      se_coord_fmt = (RADEON_VTX_W0_IS_NOT_1_OVER_W0 |
+                     RADEON_TEX1_W_ROUTING_USE_Q1);
    }
 
-   if (setup_tab[ind].vertex_format != rmesa->swtcl.vertex_format) {
-      RADEON_NEWPRIM(rmesa);
-      rmesa->swtcl.vertex_format = setup_tab[ind].vertex_format;
-      rmesa->swtcl.vertex_size = setup_tab[ind].vertex_size;
-   }
+   _tnl_need_projected_coords( ctx, rmesa->swtcl.needproj );
 
-   {
-      GLuint se_coord_fmt, needproj;
-
-      /* HW perspective divide is a win, but tiny vertex formats are a
-       * bigger one.
-       */
-      if (setup_tab[ind].vertex_format == TINY_VERTEX_FORMAT ||
-         (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
-        needproj = GL_TRUE;
-        se_coord_fmt = (RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
-                        RADEON_VTX_Z_PRE_MULT_1_OVER_W0 |
-                        RADEON_TEX1_W_ROUTING_USE_Q1);
-      }
-      else {
-        needproj = GL_FALSE;
-        se_coord_fmt = (RADEON_VTX_W0_IS_NOT_1_OVER_W0 |
-                        RADEON_TEX1_W_ROUTING_USE_Q1);
-      }
-
-      if ( se_coord_fmt != rmesa->hw.set.cmd[SET_SE_COORDFMT] ) {
-        RADEON_STATECHANGE( rmesa, set );
-        rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
-      }
-      _tnl_need_projected_coords( ctx, needproj );
+   if ( se_coord_fmt != rmesa->hw.set.cmd[SET_SE_COORDFMT] ) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
    }
 }
 
@@ -431,38 +340,6 @@ static __inline void *radeonAllocDmaLowVerts( radeonContextPtr rmesa,
 }
 
 
-
-
-static void *radeon_emit_contiguous_verts( GLcontext *ctx, 
-                                          GLuint start, 
-                                          GLuint count,
-                                          void *dest)
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint stride = rmesa->swtcl.vertex_size * 4;
-   setup_tab[rmesa->swtcl.SetupIndex].emit( ctx, start, count, dest, stride );
-   return (void *)((char *)dest + stride * (count - start));
-}
-
-
-
-void radeon_emit_indexed_verts( GLcontext *ctx, GLuint start, GLuint count )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   radeonAllocDmaRegionVerts( rmesa, 
-                             &rmesa->swtcl.indexed_verts, 
-                             count - start,
-                             rmesa->swtcl.vertex_size * 4, 
-                             64);
-
-   setup_tab[rmesa->swtcl.SetupIndex].emit( 
-      ctx, start, count, 
-      rmesa->swtcl.indexed_verts.address + rmesa->swtcl.indexed_verts.start, 
-      rmesa->swtcl.vertex_size * 4 );
-}
-
-
 /*
  * Render unclipped vertex buffers by emitting vertices directly to
  * dma buffers.  Use strip/fan hardware primitives where possible.
@@ -478,7 +355,8 @@ void radeon_emit_indexed_verts( GLcontext *ctx, GLuint start, GLuint count )
 #define HAVE_QUADS       0
 #define HAVE_QUAD_STRIPS 0
 #define HAVE_POLYGONS    0
-#define HAVE_ELTS        1
+/* \todo: is it possible to make "ELTS" work with t_vertex code ? */
+#define HAVE_ELTS        0
 
 static const GLuint hw_prim[GL_POLYGON+1] = {
    RADEON_CP_VC_CNTL_PRIM_TYPE_POINT,
@@ -500,94 +378,17 @@ static __inline void radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
    assert(rmesa->dma.current.ptr == rmesa->dma.current.start);
 }
 
-static __inline void radeonEltPrimitive( radeonContextPtr rmesa, GLenum prim )
-{
-   RADEON_NEWPRIM( rmesa );
-   rmesa->swtcl.hw_primitive = hw_prim[prim] | RADEON_CP_VC_CNTL_PRIM_WALK_IND;
-}
-
-
-
-
-#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
-#define ELTS_VARS( buf )  GLushort *dest = buf; (void)rmesa;
+#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx); (void)rmesa
 #define INIT( prim ) radeonDmaPrimitive( rmesa, prim )
-#define ELT_INIT(prim) radeonEltPrimitive( rmesa, prim )
 #define FLUSH()  RADEON_NEWPRIM( rmesa )
 #define GET_CURRENT_VB_MAX_VERTS() \
   (((int)rmesa->dma.current.end - (int)rmesa->dma.current.ptr) / (rmesa->swtcl.vertex_size*4))
 #define GET_SUBSEQUENT_VB_MAX_VERTS() \
   ((RADEON_BUFFER_SIZE) / (rmesa->swtcl.vertex_size*4))
-
-#if RADEON_OLD_PACKETS
-# define GET_CURRENT_VB_MAX_ELTS() \
-  ((RADEON_CMD_BUF_SZ - (rmesa->store.cmd_used + 24)) / 2)
-#else
-# define GET_CURRENT_VB_MAX_ELTS() \
-  ((RADEON_CMD_BUF_SZ - (rmesa->store.cmd_used + 16)) / 2)
-#endif
-#define GET_SUBSEQUENT_VB_MAX_ELTS() \
-  ((RADEON_CMD_BUF_SZ - 1024) / 2)
-
-
-static void *radeon_alloc_elts( radeonContextPtr rmesa, int nr )
-{
-   if (rmesa->dma.flush == radeonFlushElts &&
-       rmesa->store.cmd_used + nr*2 < RADEON_CMD_BUF_SZ) {
-
-      rmesa->store.cmd_used += nr*2;
-
-      return (void *)(rmesa->store.cmd_buf + rmesa->store.cmd_used);
-   }
-   else {
-      if (rmesa->dma.flush) {
-        rmesa->dma.flush( rmesa );
-      }
-
-      radeonEnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
-                              rmesa->hw.max_state_size + ELTS_BUFSZ(nr) );
-
-      radeonEmitVertexAOS( rmesa,
-                          rmesa->swtcl.vertex_size,
-                          (rmesa->radeonScreen->gart_buffer_offset +
-                           rmesa->swtcl.indexed_verts.buf->buf->idx *
-                           RADEON_BUFFER_SIZE +
-                           rmesa->swtcl.indexed_verts.start));
-
-      return (void *) radeonAllocEltsOpenEnded( rmesa,
-                                               rmesa->swtcl.vertex_format,
-                                               rmesa->swtcl.hw_primitive,
-                                               nr );
-   }
-}
-
-#define ALLOC_ELTS(nr) radeon_alloc_elts(rmesa, nr)
-
-#ifdef MESA_BIG_ENDIAN
-/* We could do without (most of) this ugliness if dest was always 32 bit word aligned... */
-#define EMIT_ELT(offset, x) do {                               \
-       int off = offset + ( ( (GLuint)dest & 0x2 ) >> 1 );     \
-       GLushort *des = (GLushort *)( (GLuint)dest & ~0x2 );    \
-       (des)[ off + 1 - 2 * ( off & 1 ) ] = (GLushort)(x);     \
-       (void)rmesa; } while (0)
-#else
-#define EMIT_ELT(offset, x) do {                               \
-       (dest)[offset] = (GLushort) (x);                        \
-       (void)rmesa; } while (0)
-#endif
-#define EMIT_TWO_ELTS(offset, x, y)  *(GLuint *)(dest+offset) = ((y)<<16)|(x);
-#define INCR_ELTS( nr ) dest += nr
-#define ELTPTR dest
-#define RELEASE_ELT_VERTS() \
-  radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ )
-#define EMIT_INDEXED_VERTS( ctx, start, count ) \
-  radeon_emit_indexed_verts( ctx, start, count )
-
-
 #define ALLOC_VERTS( nr ) \
   radeonAllocDmaLowVerts( rmesa, nr, rmesa->swtcl.vertex_size * 4 )
 #define EMIT_VERTS( ctx, j, nr, buf ) \
-  radeon_emit_contiguous_verts(ctx, j, (j)+(nr), buf)
+  _tnl_emit_vertices_to_buffer(ctx, j, (j)+(nr), buf)
 
 #define TAG(x) radeon_dma_##x
 #include "tnl_dd/t_dd_dmatmp.h"
@@ -616,15 +417,6 @@ static GLboolean radeon_run_render( GLcontext *ctx,
 
    tnl->Driver.Render.Start( ctx );
 
-   if (VB->Elts) {
-      tab = TAG(render_tab_elts);
-      if (!rmesa->swtcl.indexed_verts.buf) {
-        if (VB->Count > GET_SUBSEQUENT_VB_MAX_VERTS())
-           return GL_TRUE;
-        EMIT_INDEXED_VERTS(ctx, 0, VB->Count);
-      }
-   }
-
    for (i = 0 ; i < VB->PrimitiveCount ; i++)
    {
       GLuint prim = VB->Primitive[i].mode;
@@ -706,7 +498,7 @@ static GLboolean run_texrect_stage( GLcontext *ctx,
            in = (GLfloat *)((GLubyte *)in + instride);
         }
 
-        VB->TexCoordPtr[i] = &store->texcoord[i];
+        VB->AttribPtr[VERT_ATTRIB_TEX0+i] = VB->TexCoordPtr[i] = &store->texcoord[i];
       }
    }
 
@@ -789,12 +581,12 @@ static void radeonResetLineStipple( GLcontext *ctx );
 #define CTX_ARG radeonContextPtr rmesa
 #define CTX_ARG2 rmesa
 #define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) radeonAllocDmaLowVerts( rmesa, n, size * 4 )
+#define ALLOC_VERTS( n, size ) radeonAllocDmaLowVerts( rmesa, n, (size) * 4 )
 #undef LOCAL_VARS
 #define LOCAL_VARS                                             \
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);               \
    const char *radeonverts = (char *)rmesa->swtcl.verts;
-#define VERT(x) (radeonVertex *)(radeonverts + (x * vertsize * sizeof(int)))
+#define VERT(x) (radeonVertex *)(radeonverts + ((x) * (vertsize) * sizeof(int)))
 #define VERTEX radeonVertex 
 #undef TAG
 #define TAG(x) radeon_##x
@@ -851,7 +643,7 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e * rmesa->swtcl.vertex_size * sizeof(int)))
+#define GET_VERTEX(e) (rmesa->swtcl.verts + ((e) * rmesa->swtcl.vertex_size * sizeof(int)))
 
 #define VERT_SET_RGBA( v, c )                                          \
 do {                                                           \
@@ -864,20 +656,23 @@ do {                                                              \
 
 #define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
 
-#define VERT_SET_SPEC( v0, c )                                 \
+#define VERT_SET_SPEC( v, c )                                  \
 do {                                                           \
-   if (havespec) {                                             \
-      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.red, (c)[0]);    \
-      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.green, (c)[1]);  \
-      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.blue, (c)[2]);   \
+   if (specoffset) {                                           \
+      radeon_color_t *spec = (radeon_color_t *)&((v)->ui[specoffset]); \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);     \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);   \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);    \
    }                                                           \
 } while (0)
 #define VERT_COPY_SPEC( v0, v1 )                       \
 do {                                                   \
-   if (havespec) {                                     \
-      v0->v.specular.red   = v1->v.specular.red;       \
-      v0->v.specular.green = v1->v.specular.green;     \
-      v0->v.specular.blue  = v1->v.specular.blue;      \
+   if (specoffset) {                                   \
+      radeon_color_t *spec0 = (radeon_color_t *)&((v0)->ui[specoffset]);       \
+      radeon_color_t *spec1 = (radeon_color_t *)&((v1)->ui[specoffset]);       \
+      spec0->red   = spec1->red;       \
+      spec0->green = spec1->green;     \
+      spec0->blue  = spec1->blue;      \
    }                                                   \
 } while (0)
 
@@ -886,8 +681,8 @@ do {                                                        \
  */
 #define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
 #define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
-#define VERT_SAVE_SPEC( idx )    if (havespec) spec[idx] = v[idx]->ui[5]
-#define VERT_RESTORE_SPEC( idx ) if (havespec) v[idx]->ui[5] = spec[idx]
+#define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
+#define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
 
 #undef LOCAL_VARS
 #undef TAG
@@ -896,9 +691,9 @@ do {                                                        \
 #define LOCAL_VARS(n)                                                  \
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);                       \
    GLuint color[n], spec[n];                                           \
-   GLuint coloroffset = (rmesa->swtcl.vertex_size == 4 ? 3 : 4);       \
-   GLboolean havespec = (rmesa->swtcl.vertex_size > 4);                        \
-   (void) color; (void) spec; (void) coloroffset; (void) havespec;
+   GLuint coloroffset = rmesa->swtcl.coloroffset;      \
+   GLuint specoffset = rmesa->swtcl.specoffset;                        \
+   (void) color; (void) spec; (void) coloroffset; (void) specoffset;
 
 /***********************************************************************
  *                Helpers for rendering unfilled primitives            *
@@ -946,7 +741,6 @@ static void init_rast_tab( void )
 /*               Render unclipped begin/end objects                   */
 /**********************************************************************/
 
-#define VERT(x) (radeonVertex *)(radeonverts + (x * vertsize * sizeof(int)))
 #define RENDER_POINTS( start, count )          \
    for ( ; start < count ; start++)            \
       radeon_point( rmesa, VERT(start) )
@@ -1109,7 +903,11 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
         tnl->Driver.Render.Start = radeonRenderStart;
         tnl->Driver.Render.PrimitiveNotify = radeonRenderPrimitive;
         tnl->Driver.Render.Finish = radeonRenderFinish;
-        tnl->Driver.Render.BuildVertices = radeonBuildVertices;
+
+        tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+        tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+        tnl->Driver.Render.Interp = _tnl_interp;
+
         tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
         TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_FALSE );
         if (rmesa->TclFallback) {
@@ -1145,12 +943,10 @@ void radeonInitSwtcl( GLcontext *ctx )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint size = TNL_CONTEXT(ctx)->vb.Size;
    static int firsttime = 1;
 
    if (firsttime) {
       init_rast_tab();
-      init_setup_tab();
       firsttime = 0;
    }
 
@@ -1158,9 +954,14 @@ void radeonInitSwtcl( GLcontext *ctx )
    tnl->Driver.Render.Finish = radeonRenderFinish;
    tnl->Driver.Render.PrimitiveNotify = radeonRenderPrimitive;
    tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
-   tnl->Driver.Render.BuildVertices = radeonBuildVertices;
+   tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+   tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+   tnl->Driver.Render.Interp = _tnl_interp;
 
-   rmesa->swtcl.verts = (GLubyte *)ALIGN_MALLOC( size * 16 * 4, 32 );
+   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+                      RADEON_MAX_TNL_VERTEX_SIZE);
+   
+   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
    rmesa->swtcl.RenderIndex = ~0;
    rmesa->swtcl.render_primitive = GL_TRIANGLES;
    rmesa->swtcl.hw_primitive = 0;
@@ -1174,10 +975,4 @@ void radeonDestroySwtcl( GLcontext *ctx )
    if (rmesa->swtcl.indexed_verts.buf) 
       radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
                              __FUNCTION__ );
-
-   if (rmesa->swtcl.verts) {
-      ALIGN_FREE(rmesa->swtcl.verts);
-      rmesa->swtcl.verts = NULL;
-   }
-
 }