radeon: stupid mesa extension fail

[mesa.git] / src / mesa / drivers / dri / radeon / radeon_swtcl.c
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c

index b6014cd5fa0e40e6bf2a896f227e33c1666021fc..af933a35f2b50dabac2f327b82773d58478da943 100644 (file)
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -1,4 +1,3 @@
-/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_swtcl.c,v 1.6 2003/05/06 23:52:08 daenzer Exp $ */
  /**************************************************************************
  
  Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
@@ -33,19 +32,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
  
-#include "glheader.h"
-#include "mtypes.h"
-#include "colormac.h"
-#include "enums.h"
-#include "imports.h"
-#include "macros.h"
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+#include "main/enums.h"
+#include "main/imports.h"
+#include "main/macros.h"
  
  #include "swrast_setup/swrast_setup.h"
  #include "math/m_translate.h"
  #include "tnl/tnl.h"
  #include "tnl/t_context.h"
  #include "tnl/t_pipeline.h"
-#include "tnl/t_vtx_api.h"     /* for _tnl_FlushVertices */
  
  #include "radeon_context.h"
  #include "radeon_ioctl.h"
@@ -53,421 +51,257 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  #include "radeon_swtcl.h"
  #include "radeon_tcl.h"
  
-/***********************************************************************
- *              Build render functions from dd templates               *
- ***********************************************************************/
-
-
-#define RADEON_XYZW_BIT                0x01
-#define RADEON_RGBA_BIT                0x02
-#define RADEON_SPEC_BIT                0x04
-#define RADEON_TEX0_BIT                0x08
-#define RADEON_TEX1_BIT                0x10
-#define RADEON_PTEX_BIT                0x20
-#define RADEON_MAX_SETUP       0x40
-
-static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
-
-static struct {
-   void                (*emit)( GLcontext *, GLuint, GLuint, void *, GLuint );
-   interp_func         interp;
-   copy_pv_func                copy_pv;
-   GLboolean           (*check_tex_sizes)( GLcontext *ctx );
-   GLuint               vertex_size;
-   GLuint               vertex_stride_shift;
-   GLuint               vertex_format;
-} setup_tab[RADEON_MAX_SETUP];
-
-
-#define TINY_VERTEX_FORMAT             (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_PKCOLOR)
-
-#define NOTEX_VERTEX_FORMAT            (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_W0 |         \
-                                        RADEON_CP_VC_FRMT_PKCOLOR |    \
-                                        RADEON_CP_VC_FRMT_PKSPEC)
-
-#define TEX0_VERTEX_FORMAT             (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_W0 |         \
-                                        RADEON_CP_VC_FRMT_PKCOLOR |    \
-                                        RADEON_CP_VC_FRMT_PKSPEC |     \
-                                        RADEON_CP_VC_FRMT_ST0)
-
-#define TEX1_VERTEX_FORMAT             (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_W0 |         \
-                                        RADEON_CP_VC_FRMT_PKCOLOR |    \
-                                        RADEON_CP_VC_FRMT_PKSPEC |     \
-                                        RADEON_CP_VC_FRMT_ST0 |        \
-                                        RADEON_CP_VC_FRMT_ST1)
-
-#define PROJ_TEX1_VERTEX_FORMAT                (RADEON_CP_VC_FRMT_XY |         \
-                                        RADEON_CP_VC_FRMT_Z |          \
-                                        RADEON_CP_VC_FRMT_W0 |         \
-                                        RADEON_CP_VC_FRMT_PKCOLOR |    \
-                                        RADEON_CP_VC_FRMT_PKSPEC |     \
-                                        RADEON_CP_VC_FRMT_ST0 |        \
-                                        RADEON_CP_VC_FRMT_Q0 |         \
-                                        RADEON_CP_VC_FRMT_ST1 |        \
-                                        RADEON_CP_VC_FRMT_Q1)
-
-#define TEX2_VERTEX_FORMAT 0
-#define TEX3_VERTEX_FORMAT 0
-#define PROJ_TEX3_VERTEX_FORMAT 0
-
-#define DO_XYZW (IND & RADEON_XYZW_BIT)
-#define DO_RGBA (IND & RADEON_RGBA_BIT)
-#define DO_SPEC (IND & RADEON_SPEC_BIT)
-#define DO_FOG  (IND & RADEON_SPEC_BIT)
-#define DO_TEX0 (IND & RADEON_TEX0_BIT)
-#define DO_TEX1 (IND & RADEON_TEX1_BIT)
-#define DO_TEX2 0
-#define DO_TEX3 0
-#define DO_PTEX (IND & RADEON_PTEX_BIT)
-
-#define VERTEX radeonVertex
-#define VERTEX_COLOR radeon_color_t
-#define GET_VIEWPORT_MAT() 0
-#define GET_TEXSOURCE(n)  n
-#define GET_VERTEX_FORMAT() RADEON_CONTEXT(ctx)->swtcl.vertex_format
-#define GET_VERTEX_STORE() RADEON_CONTEXT(ctx)->swtcl.verts
-#define GET_VERTEX_STRIDE_SHIFT() RADEON_CONTEXT(ctx)->swtcl.vertex_stride_shift
-#define GET_UBYTE_COLOR_STORE() &RADEON_CONTEXT(ctx)->UbyteColor
-#define GET_UBYTE_SPEC_COLOR_STORE() &RADEON_CONTEXT(ctx)->UbyteSecondaryColor
-
-#define HAVE_HW_VIEWPORT    1
-/* Tiny vertices don't seem to work atm - haven't looked into why.
- */
-#define HAVE_HW_DIVIDE      (IND & ~(RADEON_XYZW_BIT|RADEON_RGBA_BIT))
-#define HAVE_TINY_VERTICES  1
-#define HAVE_RGBA_COLOR     1
-#define HAVE_NOTEX_VERTICES 1
-#define HAVE_TEX0_VERTICES  1
-#define HAVE_TEX1_VERTICES  1
-#define HAVE_TEX2_VERTICES  0
-#define HAVE_TEX3_VERTICES  0
-#define HAVE_PTEX_VERTICES  1
-
-#define CHECK_HW_DIVIDE    (!(ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE| \
-                                                    DD_TRI_UNFILLED)))
-
-#define IMPORT_QUALIFIER
-#define IMPORT_FLOAT_COLORS radeon_import_float_colors
-#define IMPORT_FLOAT_SPEC_COLORS radeon_import_float_spec_colors
-
-#define INTERP_VERTEX setup_tab[RADEON_CONTEXT(ctx)->swtcl.SetupIndex].interp
-#define COPY_PV_VERTEX setup_tab[RADEON_CONTEXT(ctx)->swtcl.SetupIndex].copy_pv
-
-
-/***********************************************************************
- *         Generate  pv-copying and translation functions              *
- ***********************************************************************/
-
-#define TAG(x) radeon_##x
-#define IND ~0
-#include "tnl_dd/t_dd_vb.c"
-#undef IND
-
-
-/***********************************************************************
- *             Generate vertex emit and interp functions               *
- ***********************************************************************/
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT)
-#define TAG(x) x##_wg
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_TEX0_BIT)
-#define TAG(x) x##_wgt0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_TEX0_BIT|RADEON_PTEX_BIT)
-#define TAG(x) x##_wgpt0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_TEX0_BIT|RADEON_TEX1_BIT)
-#define TAG(x) x##_wgt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_TEX0_BIT|RADEON_TEX1_BIT|\
-             RADEON_PTEX_BIT)
-#define TAG(x) x##_wgpt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT)
-#define TAG(x) x##_wgfs
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT|\
-            RADEON_TEX0_BIT)
-#define TAG(x) x##_wgfst0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT|\
-            RADEON_TEX0_BIT|RADEON_PTEX_BIT)
-#define TAG(x) x##_wgfspt0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT|\
-            RADEON_TEX0_BIT|RADEON_TEX1_BIT)
-#define TAG(x) x##_wgfst0t1
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (RADEON_XYZW_BIT|RADEON_RGBA_BIT|RADEON_SPEC_BIT|\
-            RADEON_TEX0_BIT|RADEON_TEX1_BIT|RADEON_PTEX_BIT)
-#define TAG(x) x##_wgfspt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
  
+/* R100: xyzw, c0, c1/fog, stq[0..2]  = 4+1+1+3*3 = 15 GLfloat-sized words (TODO: confirm) */
+/* R200: xyzw, c0, c1/fog, strq[0..5] = 4+1+1+4*6 = 30 GLfloat-sized words */
+#define RADEON_MAX_TNL_VERTEX_SIZE (15 * sizeof(GLfloat))      /* for mesa _tnl stage */
  
  /***********************************************************************
   *                         Initialization 
   ***********************************************************************/
  
-static void init_setup_tab( void )
-{
-   init_wg();
-   init_wgt0();
-   init_wgpt0();
-   init_wgt0t1();
-   init_wgpt0t1();
-   init_wgfs();
-   init_wgfst0();
-   init_wgfspt0();
-   init_wgfst0t1();
-   init_wgfspt0t1();
-}
-
-
-
-void radeonPrintSetupFlags(char *msg, GLuint flags )
-{
-   fprintf(stderr, "%s(%x): %s%s%s%s%s%s\n",
-          msg,
-          (int)flags,
-          (flags & RADEON_XYZW_BIT)      ? " xyzw," : "",
-          (flags & RADEON_RGBA_BIT)     ? " rgba," : "",
-          (flags & RADEON_SPEC_BIT)     ? " spec/fog," : "",
-          (flags & RADEON_TEX0_BIT)     ? " tex-0," : "",
-          (flags & RADEON_TEX1_BIT)     ? " tex-1," : "",
-          (flags & RADEON_PTEX_BIT)     ? " proj-tex," : "");
-}
-
-
-static void radeonRenderStart( GLcontext *ctx )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
-
-   if (!setup_tab[rmesa->swtcl.SetupIndex].check_tex_sizes(ctx)) {
-      GLuint ind = rmesa->swtcl.SetupIndex |= (RADEON_PTEX_BIT|RADEON_RGBA_BIT);
-
-      /* Projective textures are handled nicely; just have to change
-       * up to the new vertex format.
-       */
-      if (setup_tab[ind].vertex_format != rmesa->swtcl.vertex_format) {
-        RADEON_NEWPRIM(rmesa);
-        rmesa->swtcl.vertex_format = setup_tab[ind].vertex_format;
-        rmesa->swtcl.vertex_size = setup_tab[ind].vertex_size;
-        rmesa->swtcl.vertex_stride_shift = setup_tab[ind].vertex_stride_shift;
-      }
-
-      if (!(ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
-        tnl->Driver.Render.Interp = setup_tab[rmesa->swtcl.SetupIndex].interp;
-        tnl->Driver.Render.CopyPV = setup_tab[rmesa->swtcl.SetupIndex].copy_pv;
-      }
-   }
-   
-   if (rmesa->dma.flush != 0 && 
-       rmesa->dma.flush != flush_last_swtcl_prim)
-      rmesa->dma.flush( rmesa );
-}
+#define EMIT_ATTR( ATTR, STYLE, F0 ) /* needs rmesa and fmt_0 in scope */ \
+do { /* append one entry to swtcl.vertex_attrs and OR F0 into fmt_0 */   \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);    \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);   \
+   rmesa->radeon.swtcl.vertex_attr_count++;                                    \
+   fmt_0 |= (F0); /* parenthesized like ATTR and STYLE */                       \
+} while (0)
  
+#define EMIT_PAD( N ) /* needs rmesa in scope */                        \
+do { /* append a padding entry; N is stored in the entry's offset */    \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;         \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD; /* no '(' follows, so not this macro; presumably the t_vertex format constant */ \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);               \
+   rmesa->radeon.swtcl.vertex_attr_count++;                                    \
+} while (0)
  
-void radeonBuildVertices( GLcontext *ctx, GLuint start, GLuint count,
-                          GLuint newinputs )
+static GLuint radeon_cp_vc_frmts[3][2] =
  {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
-   GLubyte *v = ((GLubyte *)rmesa->swtcl.verts + 
-                (start << rmesa->swtcl.vertex_stride_shift));
-   GLuint stride = 1 << rmesa->swtcl.vertex_stride_shift;
-
-   newinputs |= rmesa->swtcl.SetupNewInputs;
-   rmesa->swtcl.SetupNewInputs = 0;
-
-   if (!newinputs)
-      return;
-
-   setup_tab[rmesa->swtcl.SetupIndex].emit( ctx, start, count, v, stride );
-}
+   { RADEON_CP_VC_FRMT_ST0, RADEON_CP_VC_FRMT_ST0 | RADEON_CP_VC_FRMT_Q0 },
+   { RADEON_CP_VC_FRMT_ST1, RADEON_CP_VC_FRMT_ST1 | RADEON_CP_VC_FRMT_Q1 },
+   { RADEON_CP_VC_FRMT_ST2, RADEON_CP_VC_FRMT_ST2 | RADEON_CP_VC_FRMT_Q2 },
+};
  
-void radeonChooseVertexState( GLcontext *ctx )
+static void radeonSetVertexFormat( GLcontext *ctx )
  {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
     TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint ind = (RADEON_XYZW_BIT | RADEON_RGBA_BIT);
-
-   if (!rmesa->TclFallback || rmesa->Fallback)
-      return;
+   struct vertex_buffer *VB = &tnl->vb;
+   DECLARE_RENDERINPUTS(index_bitset);
+   int fmt_0 = 0;
+   int offset = 0;
  
-   if (ctx->Fog.Enabled || (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR))
-      ind |= RADEON_SPEC_BIT;
+   RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
  
-   if (ctx->Texture._EnabledUnits & 0x2)
-      /* unit 1 enabled */
-      ind |= RADEON_TEX0_BIT|RADEON_TEX1_BIT;
-   else if (ctx->Texture._EnabledUnits & 0x1)
-      /* unit 0 enabled */
-      ind |= RADEON_TEX0_BIT;
+   /* Important: use the NDC positions when VB->NdcPtr is set, otherwise
+    * fall back to the clip-space positions. */
+   if ( VB->NdcPtr != NULL ) {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
+   }
+   else {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+   }
  
-   rmesa->swtcl.SetupIndex = ind;
+   assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+   rmesa->radeon.swtcl.vertex_attr_count = 0;
  
-   if (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED)) {
-      tnl->Driver.Render.Interp = radeon_interp_extras;
-      tnl->Driver.Render.CopyPV = radeon_copy_pv_extras;
+   /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
+    * build up a hardware vertex.
+    */
+   if ( !rmesa->swtcl.needproj ||
+        RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {      /* for projtex */
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F, 
+                RADEON_CP_VC_FRMT_XY | RADEON_CP_VC_FRMT_Z | RADEON_CP_VC_FRMT_W0 );
+      offset = 4;
     }
     else {
-      tnl->Driver.Render.Interp = setup_tab[ind].interp;
-      tnl->Driver.Render.CopyPV = setup_tab[ind].copy_pv;
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_3F, 
+                RADEON_CP_VC_FRMT_XY | RADEON_CP_VC_FRMT_Z );
+      offset = 3;
     }
  
-   if (setup_tab[ind].vertex_format != rmesa->swtcl.vertex_format) {
-      RADEON_NEWPRIM(rmesa);
-      rmesa->swtcl.vertex_format = setup_tab[ind].vertex_format;
-      rmesa->swtcl.vertex_size = setup_tab[ind].vertex_size;
-      rmesa->swtcl.vertex_stride_shift = setup_tab[ind].vertex_stride_shift;
-   }
+   rmesa->swtcl.coloroffset = offset;
+#if MESA_LITTLE_ENDIAN 
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, 
+             RADEON_CP_VC_FRMT_PKCOLOR );
+#else
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR,
+             RADEON_CP_VC_FRMT_PKCOLOR );
+#endif
+   offset += 1;
  
-   {
-      GLuint se_coord_fmt, needproj;
-
-      /* HW perspective divide is a win, but tiny vertex formats are a
-       * bigger one.
-       */
-      if (setup_tab[ind].vertex_format == TINY_VERTEX_FORMAT ||
-         (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
-        needproj = GL_TRUE;
-        se_coord_fmt = (RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
-                        RADEON_VTX_Z_PRE_MULT_1_OVER_W0 |
-                        RADEON_TEX1_W_ROUTING_USE_Q1);
+   rmesa->swtcl.specoffset = 0;
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 ) ||
+       RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+
+#if MESA_LITTLE_ENDIAN 
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+        rmesa->swtcl.specoffset = offset;
+        EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB,
+                   RADEON_CP_VC_FRMT_PKSPEC );
        }
        else {
-        needproj = GL_FALSE;
-        se_coord_fmt = (RADEON_VTX_W0_IS_NOT_1_OVER_W0 |
-                        RADEON_TEX1_W_ROUTING_USE_Q1);
+        EMIT_PAD( 3 );
        }
  
-      if ( se_coord_fmt != rmesa->hw.set.cmd[SET_SE_COORDFMT] ) {
-        RADEON_STATECHANGE( rmesa, set );
-        rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+        EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F,
+                   RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+        EMIT_PAD( 1 );
+      }
+#else
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+        EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F,
+                   RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+        EMIT_PAD( 1 );
        }
-      _tnl_need_projected_coords( ctx, needproj );
-   }
-}
-
-
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( radeonContextPtr rmesa  )
-{
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   rmesa->dma.flush = 0;
  
-   if (rmesa->dma.current.buf) {
-      struct radeon_dma_region *current = &rmesa->dma.current;
-      GLuint current_offset = (rmesa->radeonScreen->gart_buffer_offset +
-                              current->buf->buf->idx * RADEON_BUFFER_SIZE + 
-                              current->start);
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+        rmesa->swtcl.specoffset = offset;
+        EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_BGR,
+                   RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+        EMIT_PAD( 3 );
+      }
+#endif
+   }
  
-      assert (!(rmesa->swtcl.hw_primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
+   if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+      int i;
+
+      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+        if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
+           GLuint sz = VB->TexCoordPtr[i]->size;
+
+           switch (sz) {
+           case 1:
+           case 2:
+              EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_2F,
+                         radeon_cp_vc_frmts[i][0] );
+              break;
+           case 3:
+           case 4:
+              if (ctx->Texture.Unit[i]._ReallyEnabled & (TEXTURE_CUBE_BIT) ) {
+                 EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_3F,
+                            radeon_cp_vc_frmts[i][1] );
+              } else {
+                 EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_3F_XYW,
+                            radeon_cp_vc_frmts[i][1] );
+              }
+              break;
+           default:
+              continue;
+           };
+        }
+      }
+   }
  
-      assert (current->start + 
-             rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-             current->ptr);
+   if (!RENDERINPUTS_EQUAL( rmesa->radeon.tnl_index_bitset, index_bitset ) ||
+       fmt_0 != rmesa->swtcl.vertex_format) {
+      RADEON_NEWPRIM(rmesa);
+      rmesa->swtcl.vertex_format = fmt_0;
+      rmesa->radeon.swtcl.vertex_size =
+         _tnl_install_attrs( ctx,
+                             rmesa->radeon.swtcl.vertex_attrs, 
+                             rmesa->radeon.swtcl.vertex_attr_count,
+                             NULL, 0 );
+      rmesa->radeon.swtcl.vertex_size /= 4;
+      RENDERINPUTS_COPY( rmesa->radeon.tnl_index_bitset, index_bitset );
+      if (RADEON_DEBUG & DEBUG_VERTS)
+        fprintf( stderr, "%s: vertex_size= %d floats\n",
+                 __FUNCTION__, rmesa->radeon.swtcl.vertex_size);
+   }
+}
  
-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-        radeonEmitVertexAOS( rmesa,
-                             rmesa->swtcl.vertex_size,
-                             current_offset);
  
-        radeonEmitVbufPrim( rmesa,
-                            rmesa->swtcl.vertex_format,
-                            rmesa->swtcl.hw_primitive,
-                            rmesa->swtcl.numverts);
-      }
+static void radeonRenderStart( GLcontext *ctx )
+{
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
  
-      rmesa->swtcl.numverts = 0;
-      current->start = current->ptr;
-   }
+   radeonSetVertexFormat( ctx );
+   
+   if (rmesa->radeon.dma.flush != 0 && 
+       rmesa->radeon.dma.flush != rcommon_flush_last_swtcl_prim)
+      rmesa->radeon.dma.flush( ctx );
  }
  
  
-/* Alloc space in the current dma region.
+/**
+ * Set vertex state for SW TCL.  The primary purpose of this function is to
+ * determine in advance whether or not the hardware can / should do the
+ * projection divide or Mesa should do it.
   */
-static __inline void *radeonAllocDmaLowVerts( radeonContextPtr rmesa,
-                                             int nverts, int vsize )
+void radeonChooseVertexState( GLcontext *ctx )
  {
-   GLuint bytes = vsize * nverts;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      radeonRefillCurrentDmaRegion( rmesa );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
  
-   if (!rmesa->dma.flush) {
-      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-      rmesa->dma.flush = flush_last_swtcl_prim;
-   }
+   GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
+   
+   se_coord_fmt &= ~(RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
+                    RADEON_VTX_Z_PRE_MULT_1_OVER_W0 |
+                    RADEON_VTX_W0_IS_NOT_1_OVER_W0);
  
-   assert( vsize == rmesa->swtcl.vertex_size * 4 );
-   assert( rmesa->dma.flush == flush_last_swtcl_prim );
-   assert (rmesa->dma.current.start + 
-          rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-          rmesa->dma.current.ptr);
+   /* We must ensure that we don't do _tnl_need_projected_coords while in a
+    * rasterization fallback.  As this function will be called again when we
+    * leave a rasterization fallback, we can just skip it for now.
+    */
+   if (rmesa->radeon.Fallback != 0)
+      return;
  
+   /* HW perspective divide is a win, but tiny vertex formats are a
+    * bigger one.
+    */
  
-   {
-      GLubyte *head = (GLubyte *)(rmesa->dma.current.address + rmesa->dma.current.ptr);
-      rmesa->dma.current.ptr += bytes;
-      rmesa->swtcl.numverts += nverts;
-      return head;
+   if ((!RENDERINPUTS_TEST_RANGE( tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX ) &&
+       !RENDERINPUTS_TEST( tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR1 ))
+       || (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
+      rmesa->swtcl.needproj = GL_TRUE;
+      se_coord_fmt |= (RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
+                     RADEON_VTX_Z_PRE_MULT_1_OVER_W0);
+   }
+   else {
+      rmesa->swtcl.needproj = GL_FALSE;
+      se_coord_fmt |= (RADEON_VTX_W0_IS_NOT_1_OVER_W0);
     }
  
-}
+   _tnl_need_projected_coords( ctx, rmesa->swtcl.needproj );
  
+   if ( se_coord_fmt != rmesa->hw.set.cmd[SET_SE_COORDFMT] ) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
+   }
+}
  
+void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
+{
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
  
+   rcommonEnsureCmdBufSpace(&rmesa->radeon,
+                           rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
+                           __FUNCTION__);
  
-void radeon_emit_contiguous_verts( GLcontext *ctx, GLuint start, GLuint count )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint vertex_size = rmesa->swtcl.vertex_size * 4;
-   CARD32 *dest = radeonAllocDmaLowVerts( rmesa, count-start, vertex_size );
-   setup_tab[rmesa->swtcl.SetupIndex].emit( ctx, start, count, dest, 
-                                           vertex_size );
-}
  
+   radeonEmitState(&rmesa->radeon);
+   radeonEmitVertexAOS( rmesa,
+                       rmesa->radeon.swtcl.vertex_size,
+                       rmesa->radeon.dma.current,
+                       current_offset);
  
+                     
+   radeonEmitVbufPrim( rmesa,
+                      rmesa->swtcl.vertex_format,
+                      rmesa->radeon.swtcl.hw_primitive,
+                      rmesa->radeon.swtcl.numverts);
  
-void radeon_emit_indexed_verts( GLcontext *ctx, GLuint start, GLuint count )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   radeonAllocDmaRegionVerts( rmesa, 
-                             &rmesa->swtcl.indexed_verts, 
-                             count - start,
-                             rmesa->swtcl.vertex_size * 4, 
-                             64);
-
-   setup_tab[rmesa->swtcl.SetupIndex].emit( 
-      ctx, start, count, 
-      rmesa->swtcl.indexed_verts.address + rmesa->swtcl.indexed_verts.start, 
-      rmesa->swtcl.vertex_size * 4 );
  }
  
-
  /*
   * Render unclipped vertex buffers by emitting vertices directly to
   * dma buffers.  Use strip/fan hardware primitives where possible.
@@ -483,7 +317,8 @@ void radeon_emit_indexed_verts( GLcontext *ctx, GLuint start, GLuint count )
  #define HAVE_QUADS       0
  #define HAVE_QUAD_STRIPS 0
  #define HAVE_POLYGONS    0
-#define HAVE_ELTS        1
+/* \todo: is it possible to make "ELTS" work with t_vertex code ? */
+#define HAVE_ELTS        0
  
  static const GLuint hw_prim[GL_POLYGON+1] = {
     RADEON_CP_VC_CNTL_PRIM_TYPE_POINT,
@@ -498,118 +333,25 @@ static const GLuint hw_prim[GL_POLYGON+1] = {
     0
  };
  
-static __inline void radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
+static INLINE void
+radeonDmaPrimitive( r100ContextPtr rmesa, GLenum prim )
  {
     RADEON_NEWPRIM( rmesa );
-   rmesa->swtcl.hw_primitive = hw_prim[prim];
-   assert(rmesa->dma.current.ptr == rmesa->dma.current.start);
+   rmesa->radeon.swtcl.hw_primitive = hw_prim[prim];
+   //   assert(rmesa->radeon.dma.current.ptr == rmesa->radeon.dma.current.start);
  }
  
-static __inline void radeonEltPrimitive( radeonContextPtr rmesa, GLenum prim )
-{
-   RADEON_NEWPRIM( rmesa );
-   rmesa->swtcl.hw_primitive = hw_prim[prim] | RADEON_CP_VC_CNTL_PRIM_WALK_IND;
-}
-
-
-static void VERT_FALLBACK( GLcontext *ctx,
-                          GLuint start,
-                          GLuint count,
-                          GLuint flags )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   tnl->Driver.Render.PrimitiveNotify( ctx, flags & PRIM_MODE_MASK );
-   tnl->Driver.Render.BuildVertices( ctx, start, count, ~0 );
-   tnl->Driver.Render.PrimTabVerts[flags&PRIM_MODE_MASK]( ctx, start, count, flags );
-   RADEON_CONTEXT(ctx)->swtcl.SetupNewInputs = VERT_BIT_POS;
-}
-
-static void ELT_FALLBACK( GLcontext *ctx,
-                         GLuint start,
-                         GLuint count,
-                         GLuint flags )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   tnl->Driver.Render.PrimitiveNotify( ctx, flags & PRIM_MODE_MASK );
-   tnl->Driver.Render.BuildVertices( ctx, start, count, ~0 );
-   tnl->Driver.Render.PrimTabElts[flags&PRIM_MODE_MASK]( ctx, start, count, flags );
-   RADEON_CONTEXT(ctx)->swtcl.SetupNewInputs = VERT_BIT_POS;
-}
-
-
-#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
-#define ELTS_VARS  GLushort *dest
+#define LOCAL_VARS r100ContextPtr rmesa = R100_CONTEXT(ctx)
  #define INIT( prim ) radeonDmaPrimitive( rmesa, prim )
-#define ELT_INIT(prim) radeonEltPrimitive( rmesa, prim )
-#define NEW_PRIMITIVE()  RADEON_NEWPRIM( rmesa )
-#define NEW_BUFFER()  radeonRefillCurrentDmaRegion( rmesa )
-#define GET_CURRENT_VB_MAX_VERTS() \
-  (((int)rmesa->dma.current.end - (int)rmesa->dma.current.ptr) / (rmesa->swtcl.vertex_size*4))
+#define FLUSH()  RADEON_NEWPRIM( rmesa )
+#define GET_CURRENT_VB_MAX_VERTS()                                     10\
+/* FIXME: hard-coded 10; disabled expression: (((int)rmesa->radeon.dma.current.end - (int)rmesa->radeon.dma.current.ptr) / (rmesa->radeon.swtcl.vertex_size*4)) */
  #define GET_SUBSEQUENT_VB_MAX_VERTS() \
-  ((RADEON_BUFFER_SIZE) / (rmesa->swtcl.vertex_size*4))
-
-#if RADEON_OLD_PACKETS
-# define GET_CURRENT_VB_MAX_ELTS() \
-  ((RADEON_CMD_BUF_SZ - (rmesa->store.cmd_used + 24)) / 2)
-#else
-# define GET_CURRENT_VB_MAX_ELTS() \
-  ((RADEON_CMD_BUF_SZ - (rmesa->store.cmd_used + 16)) / 2)
-#endif
-#define GET_SUBSEQUENT_VB_MAX_ELTS() \
-  ((RADEON_CMD_BUF_SZ - 1024) / 2)
-
-
-
-/* How do you extend an existing primitive?
- */
-#define ALLOC_ELTS(nr)                                                 \
-do {                                                                   \
-   if (rmesa->dma.flush == radeonFlushElts &&                          \
-       rmesa->store.cmd_used + nr*2 < RADEON_CMD_BUF_SZ) {             \
-                                                                       \
-      dest = (GLushort *)(rmesa->store.cmd_buf +                       \
-                         rmesa->store.cmd_used);                       \
-      rmesa->store.cmd_used += nr*2;                                   \
-   }                                                                   \
-   else {                                                              \
-      if (rmesa->dma.flush) {                                          \
-        rmesa->dma.flush( rmesa );                                     \
-      }                                                                        \
-                                                                       \
-      radeonEmitVertexAOS( rmesa,                                      \
-                          rmesa->swtcl.vertex_size,                    \
-                          (rmesa->radeonScreen->gart_buffer_offset +   \
-                           rmesa->swtcl.indexed_verts.buf->buf->idx *  \
-                           RADEON_BUFFER_SIZE +                        \
-                           rmesa->swtcl.indexed_verts.start));         \
-                                                                       \
-      dest = radeonAllocEltsOpenEnded( rmesa,                          \
-                                      rmesa->swtcl.vertex_format,      \
-                                      rmesa->swtcl.hw_primitive,       \
-                                      nr );                            \
-   }                                                                   \
-} while (0)
-
-#define ALLOC_ELTS_NEW_PRIMITIVE(nr) ALLOC_ELTS( nr )
-
-#ifdef MESA_BIG_ENDIAN
-/* We could do without (most of) this ugliness if dest was always 32 bit word aligned... */
-#define EMIT_ELT(offset, x) do {                               \
-       int off = offset + ( ( (GLuint)dest & 0x2 ) >> 1 );     \
-       GLushort *des = (GLushort *)( (GLuint)dest & ~0x2 );    \
-       (des)[ off + 1 - 2 * ( off & 1 ) ] = (GLushort)(x); } while (0)
-#else
-#define EMIT_ELT(offset, x) (dest)[offset] = (GLushort) (x)
-#endif
-#define EMIT_TWO_ELTS(offset, x, y)  *(GLuint *)(dest+offset) = ((y)<<16)|(x);
-#define INCR_ELTS( nr ) dest += nr
-#define RELEASE_ELT_VERTS() \
-  radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ )
-#define EMIT_VERTS( ctx, j, nr ) \
-  radeon_emit_contiguous_verts(ctx, j, (j)+(nr))
-#define EMIT_INDEXED_VERTS( ctx, start, count ) \
-  radeon_emit_indexed_verts( ctx, start, count )
-
+  ((RADEON_BUFFER_SIZE) / (rmesa->radeon.swtcl.vertex_size*4))
+#define ALLOC_VERTS( nr ) \
+  rcommonAllocDmaLowVerts( &rmesa->radeon, nr, rmesa->radeon.swtcl.vertex_size * 4 )
+#define EMIT_VERTS( ctx, j, nr, buf ) \
+  _tnl_emit_vertices_to_buffer(ctx, j, (j)+(nr), buf)
  
  #define TAG(x) radeon_dma_##x
  #include "tnl_dd/t_dd_dmatmp.h"
@@ -623,29 +365,18 @@ do {                                                                      \
  static GLboolean radeon_run_render( GLcontext *ctx,
                                     struct tnl_pipeline_stage *stage )
  {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
     TNLcontext *tnl = TNL_CONTEXT(ctx);
     struct vertex_buffer *VB = &tnl->vb;
-   GLuint i, length, flags = 0;
-   render_func *tab = TAG(render_tab_verts);
-
-   if (rmesa->swtcl.indexed_verts.buf && (!VB->Elts || stage->changed_inputs)) 
-      RELEASE_ELT_VERTS();
-       
-   if (VB->ClipOrMask ||            /* No clipping */
-       rmesa->swtcl.RenderIndex != 0 ||    /* No per-vertex manipulations */
-       ctx->Line.StippleFlag)        /* GH: THIS IS A HACK!!! */
+   tnl_render_func *tab = TAG(render_tab_verts);
+   GLuint i;
+
+   if (rmesa->radeon.swtcl.RenderIndex != 0 ||   
+       !radeon_dma_validate_render( ctx, VB ))
        return GL_TRUE;          
  
     tnl->Driver.Render.Start( ctx );
  
-   if (VB->Elts) {
-      tab = TAG(render_tab_elts);
-      if (!rmesa->swtcl.indexed_verts.buf)
-        if (!TAG(emit_elt_verts)(ctx, 0, VB->Count))
-           return GL_TRUE;     /* too many vertices */
-   }
-
     for (i = 0 ; i < VB->PrimitiveCount ; i++)
     {
        GLuint prim = VB->Primitive[i].mode;
@@ -656,12 +387,12 @@ static GLboolean radeon_run_render( GLcontext *ctx,
          continue;
  
        if (RADEON_DEBUG & DEBUG_PRIMS)
-        fprintf(stderr, "r200_render.c: prim %s %d..%d\n", 
+        fprintf(stderr, "radeon_render.c: prim %s %d..%d\n", 
                  _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
                  start, start+length);
  
        if (length)
-        tab[prim & PRIM_MODE_MASK]( ctx, start, start + length, flags );
+        tab[prim & PRIM_MODE_MASK]( ctx, start, start + length, prim );
     }
  
     tnl->Driver.Render.Finish( ctx );
@@ -671,179 +402,17 @@ static GLboolean radeon_run_render( GLcontext *ctx,
  
  
  
-static void radeon_check_render( GLcontext *ctx,
-                                struct tnl_pipeline_stage *stage )
-{
-   GLuint inputs = VERT_BIT_POS | VERT_BIT_COLOR0;
-
-   if (ctx->RenderMode == GL_RENDER) {
-      if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR)
-        inputs |= VERT_BIT_COLOR1;
-
-      if (ctx->Texture.Unit[0]._ReallyEnabled)
-        inputs |= VERT_BIT_TEX0;
-
-      if (ctx->Texture.Unit[1]._ReallyEnabled)
-        inputs |= VERT_BIT_TEX1;
-
-      if (ctx->Fog.Enabled)
-        inputs |= VERT_BIT_FOG;
-   }
-
-   stage->inputs = inputs;
-}
-
-
-static void dtr( struct tnl_pipeline_stage *stage )
-{
-   (void)stage;
-}
-
-
  const struct tnl_pipeline_stage _radeon_render_stage =
  {
     "radeon render",
-   (_DD_NEW_SEPARATE_SPECULAR |
-    _NEW_TEXTURE|
-    _NEW_FOG|
-    _NEW_RENDERMODE),          /* re-check (new inputs) */
-   0,                          /* re-run (always runs) */
-   GL_TRUE,                    /* active */
-   0, 0,                       /* inputs (set in check_render), outputs */
-   0, 0,                       /* changed_inputs, private */
-   dtr,                                /* destructor */
-   radeon_check_render,                /* check - initially set to alloc data */
+   NULL,
+   NULL,
+   NULL,
+   NULL,
     radeon_run_render           /* run */
  };
  
  
-/**************************************************************************/
-
-/* Radeon texture rectangle expects coords in 0..1 range, not 0..dimension
- * as in the extension spec.  Need to translate here.
- *
- * Note that swrast expects 0..dimension, so if a fallback is active,
- * don't do anything.  (Maybe need to configure swrast to match hw)
- */
-struct texrect_stage_data {
-   GLvector4f texcoord[MAX_TEXTURE_UNITS];
-};
-
-#define TEXRECT_STAGE_DATA(stage) ((struct texrect_stage_data *)stage->privatePtr)
-
-
-static GLboolean run_texrect_stage( GLcontext *ctx,
-                                   struct tnl_pipeline_stage *stage )
-{
-   struct texrect_stage_data *store = TEXRECT_STAGE_DATA(stage);
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct vertex_buffer *VB = &tnl->vb;
-   GLuint i;
-
-   if (rmesa->Fallback)
-      return GL_TRUE;
-
-   for (i = 0 ; i < ctx->Const.MaxTextureUnits ; i++) {
-      if (!(ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_RECT_BIT))
-        continue;
-   
-      if (stage->changed_inputs & VERT_BIT_TEX(i)) {
-        struct gl_texture_object *texObj = ctx->Texture.Unit[i].CurrentRect;
-        struct gl_texture_image *texImage = texObj->Image[texObj->BaseLevel];
-        const GLfloat iw = 1.0/texImage->Width;
-        const GLfloat ih = 1.0/texImage->Height;
-        GLfloat *in = (GLfloat *)VB->TexCoordPtr[i]->data;
-        GLint instride = VB->TexCoordPtr[i]->stride;
-        GLfloat (*out)[4] = store->texcoord[i].data;
-        GLint j;
-        
-        for (j = 0 ; j < VB->Count ; j++) {
-           out[j][0] = in[0] * iw;
-           out[j][1] = in[1] * ih;
-           in = (GLfloat *)((GLubyte *)in + instride);
-        }
-      }
-
-      VB->TexCoordPtr[i] = &store->texcoord[i];
-   }
-
-   return GL_TRUE;
-}
-
-
-/* Called the first time stage->run() is invoked.
- */
-static GLboolean alloc_texrect_data( GLcontext *ctx,
-                                    struct tnl_pipeline_stage *stage )
-{
-   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
-   struct texrect_stage_data *store;
-   GLuint i;
-
-   stage->privatePtr = CALLOC(sizeof(*store));
-   store = TEXRECT_STAGE_DATA(stage);
-   if (!store)
-      return GL_FALSE;
-
-   for (i = 0 ; i < ctx->Const.MaxTextureUnits ; i++)
-      _mesa_vector4f_alloc( &store->texcoord[i], 0, VB->Size, 32 );
-
-   /* Now run the stage.
-    */
-   stage->run = run_texrect_stage;
-   return stage->run( ctx, stage );
-}
-
-
-static void check_texrect( GLcontext *ctx,
-                          struct tnl_pipeline_stage *stage )
-{
-   GLuint flags = 0;
-
-   if (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_RECT_BIT)
-      flags |= VERT_BIT_TEX0;
-
-   if (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_RECT_BIT)
-      flags |= VERT_BIT_TEX1;
-
-   stage->inputs = flags;
-   stage->outputs = flags;
-   stage->active = (flags != 0);
-}
-
-
-static void free_texrect_data( struct tnl_pipeline_stage *stage )
-{
-   struct texrect_stage_data *store = TEXRECT_STAGE_DATA(stage);
-   GLuint i;
-
-   if (store) {
-      for (i = 0 ; i < MAX_TEXTURE_UNITS ; i++)
-        if (store->texcoord[i].data)
-           _mesa_vector4f_free( &store->texcoord[i] );
-      FREE( store );
-      stage->privatePtr = 0;
-   }
-}
-
-
-const struct tnl_pipeline_stage _radeon_texrect_stage =
-{
-   "radeon texrect stage",                     /* name */
-   _NEW_TEXTURE,       /* check_state */
-   _NEW_TEXTURE,       /* run_state */
-   GL_TRUE,                            /* active? */
-   0,                                  /* inputs */
-   0,                                  /* outputs */
-   0,                                  /* changed_inputs */
-   NULL,                               /* private data */
-   free_texrect_data,                  /* destructor */
-   check_texrect,                      /* check */
-   alloc_texrect_data,                 /* run -- initially set to init */
-};
-
-
  /**************************************************************************/
  
  
@@ -870,16 +439,15 @@ static void radeonResetLineStipple( GLcontext *ctx );
   ***********************************************************************/
  
  #undef LOCAL_VARS
-#define CTX_ARG radeonContextPtr rmesa
-#define CTX_ARG2 rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) radeonAllocDmaLowVerts( rmesa, n, size * 4 )
+#undef ALLOC_VERTS
+#define CTX_ARG r100ContextPtr rmesa
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, (size) * 4 )
  #undef LOCAL_VARS
  #define LOCAL_VARS                                             \
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);               \
-   const GLuint shift = rmesa->swtcl.vertex_stride_shift;      \
-   const char *radeonverts = (char *)rmesa->swtcl.verts;
-#define VERT(x) (radeonVertex *)(radeonverts + (x << shift))
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);           \
+   const char *radeonverts = (char *)rmesa->radeon.swtcl.verts;
+#define VERT(x) (radeonVertex *)(radeonverts + ((x) * (vertsize) * sizeof(int)))
  #define VERTEX radeonVertex 
  #undef TAG
  #define TAG(x) radeon_##x
@@ -901,14 +469,14 @@ static void radeonResetLineStipple( GLcontext *ctx );
  
  #define RADEON_TWOSIDE_BIT     0x01
  #define RADEON_UNFILLED_BIT    0x02
-#define RADEON_MAX_TRIFUNC     0x08
+#define RADEON_MAX_TRIFUNC     0x04
  
  
  static struct {
-   points_func         points;
-   line_func           line;
-   triangle_func       triangle;
-   quad_func           quad;
+   tnl_points_func             points;
+   tnl_line_func               line;
+   tnl_triangle_func   triangle;
+   tnl_quad_func               quad;
  } rast_tab[RADEON_MAX_TRIFUNC];
  
  
@@ -925,7 +493,6 @@ static struct {
  
  #define HAVE_RGBA   1
  #define HAVE_SPEC   1
-#define HAVE_INDEX  0
  #define HAVE_BACK_COLORS  0
  #define HAVE_HW_FLATSHADE 1
  #define TAB rast_tab
@@ -937,7 +504,7 @@ static struct {
  #define VERT_Y(_v) _v->v.y
  #define VERT_Z(_v) _v->v.z
  #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e<<rmesa->swtcl.vertex_stride_shift))
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + ((e) * rmesa->radeon.swtcl.vertex_size * sizeof(int)))
  
  #define VERT_SET_RGBA( v, c )                                          \
  do {                                                           \
@@ -950,20 +517,23 @@ do {                                                              \
  
  #define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
  
-#define VERT_SET_SPEC( v0, c )                                 \
+#define VERT_SET_SPEC( v, c )                                  \
  do {                                                           \
-   if (havespec) {                                             \
-      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.red, (c)[0]);    \
-      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.green, (c)[1]);  \
-      UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.blue, (c)[2]);   \
+   if (specoffset) {                                           \
+      radeon_color_t *spec = (radeon_color_t *)&((v)->ui[specoffset]); \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);     \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);   \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);    \
     }                                                           \
  } while (0)
  #define VERT_COPY_SPEC( v0, v1 )                       \
  do {                                                   \
-   if (havespec) {                                     \
-      v0->v.specular.red   = v1->v.specular.red;       \
-      v0->v.specular.green = v1->v.specular.green;     \
-      v0->v.specular.blue  = v1->v.specular.blue;      \
+   if (specoffset) {                                   \
+      radeon_color_t *spec0 = (radeon_color_t *)&((v0)->ui[specoffset]);       \
+      radeon_color_t *spec1 = (radeon_color_t *)&((v1)->ui[specoffset]);       \
+      spec0->red   = spec1->red;       \
+      spec0->green = spec1->green;     \
+      spec0->blue  = spec1->blue;      \
     }                                                   \
  } while (0)
  
@@ -972,26 +542,26 @@ do {                                                      \
   */
  #define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
  #define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
-#define VERT_SAVE_SPEC( idx )    if (havespec) spec[idx] = v[idx]->ui[5]
-#define VERT_RESTORE_SPEC( idx ) if (havespec) v[idx]->ui[5] = spec[idx]
+#define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
+#define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
  
  #undef LOCAL_VARS
  #undef TAG
  #undef INIT
  
  #define LOCAL_VARS(n)                                                  \
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);                       \
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);                   \
     GLuint color[n], spec[n];                                           \
-   GLuint coloroffset = (rmesa->swtcl.vertex_size == 4 ? 3 : 4);       \
-   GLboolean havespec = (rmesa->swtcl.vertex_size > 4);                        \
-   (void) color; (void) spec; (void) coloroffset; (void) havespec;
+   GLuint coloroffset = rmesa->swtcl.coloroffset;      \
+   GLuint specoffset = rmesa->swtcl.specoffset;                        \
+   (void) color; (void) spec; (void) coloroffset; (void) specoffset;
  
  /***********************************************************************
   *                Helpers for rendering unfilled primitives            *
   ***********************************************************************/
  
  #define RASTERIZE(x) radeonRasterPrimitive( ctx, reduced_hw_prim[x] )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
  #undef TAG
  #define TAG(x) x
  #include "tnl_dd/t_dd_unfilled.h"
@@ -1032,7 +602,6 @@ static void init_rast_tab( void )
  /*               Render unclipped begin/end objects                   */
  /**********************************************************************/
  
-#define VERT(x) (radeonVertex *)(radeonverts + (x << shift))
  #define RENDER_POINTS( start, count )          \
     for ( ; start < count ; start++)            \
        radeon_point( rmesa, VERT(start) )
@@ -1048,9 +617,9 @@ static void init_rast_tab( void )
  } while (0)
  #undef LOCAL_VARS
  #define LOCAL_VARS                                             \
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);               \
-   const GLuint shift = rmesa->swtcl.vertex_stride_shift;              \
-   const char *radeonverts = (char *)rmesa->swtcl.verts;               \
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);           \
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;            \
+   const char *radeonverts = (char *)rmesa->radeon.swtcl.verts;                \
     const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;       \
     const GLboolean stipple = ctx->Line.StippleFlag;            \
     (void) elt; (void) stipple;
@@ -1075,17 +644,17 @@ static void init_rast_tab( void )
  void radeonChooseRenderState( GLcontext *ctx )
  {
     TNLcontext *tnl = TNL_CONTEXT(ctx);
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
     GLuint index = 0;
     GLuint flags = ctx->_TriangleCaps;
  
-   if (!rmesa->TclFallback || rmesa->Fallback) 
+   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback) 
        return;
  
     if (flags & DD_TRI_LIGHT_TWOSIDE) index |= RADEON_TWOSIDE_BIT;
     if (flags & DD_TRI_UNFILLED)      index |= RADEON_UNFILLED_BIT;
  
-   if (index != rmesa->swtcl.RenderIndex) {
+   if (index != rmesa->radeon.swtcl.RenderIndex) {
        tnl->Driver.Render.Points = rast_tab[index].points;
        tnl->Driver.Render.Line = rast_tab[index].line;
        tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -1102,7 +671,7 @@ void radeonChooseRenderState( GLcontext *ctx )
          tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
        }
  
-      rmesa->swtcl.RenderIndex = index;
+      rmesa->radeon.swtcl.RenderIndex = index;
     }
  }
  
@@ -1114,18 +683,18 @@ void radeonChooseRenderState( GLcontext *ctx )
  
  static void radeonRasterPrimitive( GLcontext *ctx, GLuint hwprim )
  {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
  
-   if (rmesa->swtcl.hw_primitive != hwprim) {
+   if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
        RADEON_NEWPRIM( rmesa );
-      rmesa->swtcl.hw_primitive = hwprim;
+      rmesa->radeon.swtcl.hw_primitive = hwprim;
     }
  }
  
  static void radeonRenderPrimitive( GLcontext *ctx, GLenum prim )
  {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   rmesa->swtcl.render_primitive = prim;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   rmesa->radeon.swtcl.render_primitive = prim;
     if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
        radeonRasterPrimitive( ctx, reduced_hw_prim[prim] );
  }
@@ -1136,7 +705,7 @@ static void radeonRenderFinish( GLcontext *ctx )
  
  static void radeonResetLineStipple( GLcontext *ctx )
  {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
     RADEON_STATECHANGE( rmesa, lin );
  }
  
@@ -1170,18 +739,17 @@ static const char *getFallbackString(GLuint bit)
  
  void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
  {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
     TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->Fallback;
+   GLuint oldfallback = rmesa->radeon.Fallback;
  
     if (mode) {
-      rmesa->Fallback |= bit;
+      rmesa->radeon.Fallback |= bit;
        if (oldfallback == 0) {
-        RADEON_FIREVERTICES( rmesa );
+        radeon_firevertices(&rmesa->radeon);
          TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_TRUE );
          _swsetup_Wakeup( ctx );
-        _tnl_need_projected_coords( ctx, GL_TRUE );
-        rmesa->swtcl.RenderIndex = ~0;
+        rmesa->radeon.swtcl.RenderIndex = ~0;
           if (RADEON_DEBUG & DEBUG_FALLBACKS) {
              fprintf(stderr, "Radeon begin rasterization fallback: 0x%x %s\n",
                      bit, getFallbackString(bit));
@@ -1189,20 +757,27 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
        }
     }
     else {
-      rmesa->Fallback &= ~bit;
+      rmesa->radeon.Fallback &= ~bit;
        if (oldfallback == bit) {
          _swrast_flush( ctx );
          tnl->Driver.Render.Start = radeonRenderStart;
          tnl->Driver.Render.PrimitiveNotify = radeonRenderPrimitive;
          tnl->Driver.Render.Finish = radeonRenderFinish;
-        tnl->Driver.Render.BuildVertices = radeonBuildVertices;
+
+        tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+        tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+        tnl->Driver.Render.Interp = _tnl_interp;
+
          tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
          TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_FALSE );
-        if (rmesa->TclFallback) {
-           /* These are already done if rmesa->TclFallback goes to
+        if (rmesa->radeon.TclFallback) {
+           /* These are already done if rmesa->radeon.TclFallback goes to
              * zero above. But not if it doesn't (RADEON_NO_TCL for
              * example?)
              */
+           _tnl_invalidate_vertex_state( ctx, ~0 );
+           _tnl_invalidate_vertices( ctx, ~0 );
+           RENDERINPUTS_ZERO( rmesa->radeon.tnl_index_bitset );
             radeonChooseVertexState( ctx );
             radeonChooseRenderState( ctx );
          }
@@ -1215,14 +790,6 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
  }
  
  
-void radeonFlushVertices( GLcontext *ctx, GLuint flags )
-{
-   _tnl_FlushVertices( ctx, flags );
-
-   if (flags & FLUSH_STORED_VERTICES)
-      RADEON_NEWPRIM( RADEON_CONTEXT( ctx ) );
-}
-
  /**********************************************************************/
  /*                            Initialization.                         */
  /**********************************************************************/
@@ -1230,13 +797,11 @@ void radeonFlushVertices( GLcontext *ctx, GLuint flags )
  void radeonInitSwtcl( GLcontext *ctx )
  {
     TNLcontext *tnl = TNL_CONTEXT(ctx);
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint size = TNL_CONTEXT(ctx)->vb.Size;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
     static int firsttime = 1;
  
     if (firsttime) {
        init_rast_tab();
-      init_setup_tab();
        firsttime = 0;
     }
  
@@ -1244,26 +809,22 @@ void radeonInitSwtcl( GLcontext *ctx )
     tnl->Driver.Render.Finish = radeonRenderFinish;
     tnl->Driver.Render.PrimitiveNotify = radeonRenderPrimitive;
     tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
-   tnl->Driver.Render.BuildVertices = radeonBuildVertices;
+   tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+   tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+   tnl->Driver.Render.Interp = _tnl_interp;
  
-   rmesa->swtcl.verts = (GLubyte *)ALIGN_MALLOC( size * 16 * 4, 32 );
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->swtcl.render_primitive = GL_TRIANGLES;
-   rmesa->swtcl.hw_primitive = 0;
+   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+                      RADEON_MAX_TNL_VERTEX_SIZE);
+   
+   rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+   rmesa->radeon.swtcl.hw_primitive = 0;
  }
  
  
  void radeonDestroySwtcl( GLcontext *ctx )
  {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (rmesa->swtcl.indexed_verts.buf) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
-                             __FUNCTION__ );
-
-   if (rmesa->swtcl.verts) {
-      ALIGN_FREE(rmesa->swtcl.verts);
-      rmesa->swtcl.verts = 0;
-   }
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
  
  }