r200/r300: swtcl fixups to use old dma buffers on top of BOs
[mesa.git] / src / mesa / drivers / dri / r200 / r200_swtcl.c
index ca5c456889d25e1da41d0a31cd38eeb8572706b2..a39072d1d2bf1c160e429dd8c2a7706a11333d58 100644 (file)
@@ -1,4 +1,3 @@
-/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_swtcl.c,v 1.5 2003/05/06 23:52:08 daenzer Exp $ */
 /*
 Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
 
@@ -33,13 +32,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *   Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "glheader.h"
-#include "mtypes.h"
-#include "colormac.h"
-#include "enums.h"
-#include "image.h"
-#include "imports.h"
-#include "macros.h"
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+#include "main/enums.h"
+#include "main/image.h"
+#include "main/imports.h"
+#include "main/macros.h"
 
 #include "swrast/s_context.h"
 #include "swrast/s_fog.h"
@@ -49,729 +48,302 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
 
+#include "radeon_bo.h"
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_state.h"
 #include "r200_swtcl.h"
 #include "r200_tcl.h"
 
-/***********************************************************************
- *              Build render functions from dd templates               *
- ***********************************************************************/
-
-
-#define R200_XYZW_BIT          0x01
-#define R200_RGBA_BIT          0x02
-#define R200_SPEC_BIT          0x04
-#define R200_TEX0_BIT          0x08
-#define R200_TEX1_BIT          0x10
-#define R200_PTEX_BIT          0x20
-#define R200_MAX_SETUP 0x40
-
-static void flush_last_swtcl_prim( r200ContextPtr rmesa  );
-
-static struct {
-   void                (*emit)( GLcontext *, GLuint, GLuint, void *, GLuint );
-   interp_func         interp;
-   copy_pv_func                copy_pv;
-   GLboolean           (*check_tex_sizes)( GLcontext *ctx );
-   GLuint               vertex_size;
-   GLuint               vertex_stride_shift;
-   GLuint               vertex_format;
-} setup_tab[R200_MAX_SETUP];
-
-
-static int se_vtx_fmt_0[] = {
-   0,
-
-   (R200_VTX_XY |
-    R200_VTX_Z0 |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT)),
-
-   (R200_VTX_XY |
-    R200_VTX_Z0 |
-    R200_VTX_W0 |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT)),
-
-   (R200_VTX_XY |
-    R200_VTX_Z0 |
-    R200_VTX_W0 |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT)),
-
-   (R200_VTX_XY |
-    R200_VTX_Z0 |
-    R200_VTX_W0 |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT)),
-
-   (R200_VTX_XY |
-    R200_VTX_Z0 |
-    R200_VTX_W0 |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) |
-    (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT))
-};
-
-static int se_vtx_fmt_1[] = {
-   0,
-   0,
-   0,
-   ((2 << R200_VTX_TEX0_COMP_CNT_SHIFT)),
-   ((2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
-    (2 << R200_VTX_TEX1_COMP_CNT_SHIFT)),
-   ((3 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
-    (3 << R200_VTX_TEX1_COMP_CNT_SHIFT)),
-};
-
-#define TINY_VERTEX_FORMAT     1
-#define NOTEX_VERTEX_FORMAT    2
-#define TEX0_VERTEX_FORMAT     3
-#define TEX1_VERTEX_FORMAT     4
-#define PROJ_TEX1_VERTEX_FORMAT        5
-#define TEX2_VERTEX_FORMAT 0
-#define TEX3_VERTEX_FORMAT 0
-#define PROJ_TEX3_VERTEX_FORMAT 0
-
-#define DO_XYZW (IND & R200_XYZW_BIT)
-#define DO_RGBA (IND & R200_RGBA_BIT)
-#define DO_SPEC (IND & R200_SPEC_BIT)
-#define DO_FOG  (IND & R200_SPEC_BIT)
-#define DO_TEX0 (IND & R200_TEX0_BIT)
-#define DO_TEX1 (IND & R200_TEX1_BIT)
-#define DO_TEX2 0
-#define DO_TEX3 0
-#define DO_PTEX (IND & R200_PTEX_BIT)
-
-#define VERTEX r200Vertex
-#define VERTEX_COLOR r200_color_t
-#define GET_VIEWPORT_MAT() 0
-#define GET_TEXSOURCE(n)  n
-#define GET_VERTEX_FORMAT() R200_CONTEXT(ctx)->swtcl.vertex_format
-#define GET_VERTEX_STORE() R200_CONTEXT(ctx)->swtcl.verts
-#define GET_VERTEX_STRIDE_SHIFT() R200_CONTEXT(ctx)->swtcl.vertex_stride_shift
-#define GET_UBYTE_COLOR_STORE() &R200_CONTEXT(ctx)->UbyteColor
-#define GET_UBYTE_SPEC_COLOR_STORE() &R200_CONTEXT(ctx)->UbyteSecondaryColor
-
-#define HAVE_HW_VIEWPORT    1
-#define HAVE_HW_DIVIDE      (IND & ~(R200_XYZW_BIT|R200_RGBA_BIT))
-#define HAVE_TINY_VERTICES  1
-#define HAVE_RGBA_COLOR     1
-#define HAVE_NOTEX_VERTICES 1
-#define HAVE_TEX0_VERTICES  1
-#define HAVE_TEX1_VERTICES  1
-#define HAVE_TEX2_VERTICES  0
-#define HAVE_TEX3_VERTICES  0
-#define HAVE_PTEX_VERTICES  1
-
-#define CHECK_HW_DIVIDE    (!(ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE| \
-                                                    DD_TRI_UNFILLED)))
-
-#define IMPORT_QUALIFIER
-#define IMPORT_FLOAT_COLORS r200_import_float_colors
-#define IMPORT_FLOAT_SPEC_COLORS r200_import_float_spec_colors
-
-#define INTERP_VERTEX setup_tab[R200_CONTEXT(ctx)->swtcl.SetupIndex].interp
-#define COPY_PV_VERTEX setup_tab[R200_CONTEXT(ctx)->swtcl.SetupIndex].copy_pv
-
-
-/***********************************************************************
- *         Generate  pv-copying and translation functions              *
- ***********************************************************************/
-
-#define TAG(x) r200_##x
-#define IND ~0
-#include "tnl_dd/t_dd_vb.c"
-#undef IND
-
-
-/***********************************************************************
- *             Generate vertex emit and interp functions               *
- ***********************************************************************/
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT)
-#define TAG(x) x##_wg
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_TEX0_BIT)
-#define TAG(x) x##_wgt0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_TEX0_BIT|R200_PTEX_BIT)
-#define TAG(x) x##_wgpt0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_TEX0_BIT|R200_TEX1_BIT)
-#define TAG(x) x##_wgt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_TEX0_BIT|R200_TEX1_BIT|\
-             R200_PTEX_BIT)
-#define TAG(x) x##_wgpt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_SPEC_BIT)
-#define TAG(x) x##_wgfs
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_SPEC_BIT|\
-            R200_TEX0_BIT)
-#define TAG(x) x##_wgfst0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_SPEC_BIT|\
-            R200_TEX0_BIT|R200_PTEX_BIT)
-#define TAG(x) x##_wgfspt0
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_SPEC_BIT|\
-            R200_TEX0_BIT|R200_TEX1_BIT)
-#define TAG(x) x##_wgfst0t1
-#include "tnl_dd/t_dd_vbtmp.h"
-
-#define IND (R200_XYZW_BIT|R200_RGBA_BIT|R200_SPEC_BIT|\
-            R200_TEX0_BIT|R200_TEX1_BIT|R200_PTEX_BIT)
-#define TAG(x) x##_wgfspt0t1
-#include "tnl_dd/t_dd_vbtmp.h"
-
 
 /***********************************************************************
  *                         Initialization 
  ***********************************************************************/
 
-static void init_setup_tab( void )
-{
-   init_wg();
-   init_wgt0();
-   init_wgpt0();
-   init_wgt0t1();
-   init_wgpt0t1();
-   init_wgfs();
-   init_wgfst0();
-   init_wgfspt0();
-   init_wgfst0t1();
-   init_wgfspt0t1();
-}
-
-
-
-void r200PrintSetupFlags(char *msg, GLuint flags )
-{
-   fprintf(stderr, "%s(%x): %s%s%s%s%s%s\n",
-          msg,
-          (int)flags,
-          (flags & R200_XYZW_BIT)      ? " xyzw," : "",
-          (flags & R200_RGBA_BIT)     ? " rgba," : "",
-          (flags & R200_SPEC_BIT)     ? " spec/fog," : "",
-          (flags & R200_TEX0_BIT)     ? " tex-0," : "",
-          (flags & R200_TEX1_BIT)     ? " tex-1," : "",
-          (flags & R200_PTEX_BIT)     ? " proj-tex," : "");
-}
-
+#define EMIT_ATTR( ATTR, STYLE, F0 )                                   \
+do {                                                                   \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);    \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);   \
+   rmesa->radeon.swtcl.vertex_attr_count++;                                    \
+   fmt_0 |= F0;                                                                \
+} while (0)
 
+#define EMIT_PAD( N )                                                  \
+do {                                                                   \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;         \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;  \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);               \
+   rmesa->radeon.swtcl.vertex_attr_count++;                                    \
+} while (0)
 
-static void r200SetVertexFormat( GLcontext *ctx, GLuint ind ) 
+static void r200SetVertexFormat( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   DECLARE_RENDERINPUTS(index_bitset);
+   int fmt_0 = 0;
+   int fmt_1 = 0;
+   int offset = 0;
 
-   rmesa->swtcl.SetupIndex = ind;
+   RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
 
-   if (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED)) {
-      tnl->Driver.Render.Interp = r200_interp_extras;
-      tnl->Driver.Render.CopyPV = r200_copy_pv_extras;
+   /* Important: take the position attribute from NdcPtr when it is set,
+    * otherwise fall back to ClipPtr. */
+   if ( VB->NdcPtr != NULL ) {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
    }
    else {
-      tnl->Driver.Render.Interp = setup_tab[ind].interp;
-      tnl->Driver.Render.CopyPV = setup_tab[ind].copy_pv;
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
    }
 
-   if (setup_tab[ind].vertex_format != rmesa->swtcl.vertex_format) {
-      int i;
-      R200_NEWPRIM(rmesa);
-      i = rmesa->swtcl.vertex_format = setup_tab[ind].vertex_format;
-      rmesa->swtcl.vertex_size = setup_tab[ind].vertex_size;
-      rmesa->swtcl.vertex_stride_shift = setup_tab[ind].vertex_stride_shift;
+   assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+   rmesa->radeon.swtcl.vertex_attr_count = 0;
 
-      R200_STATECHANGE( rmesa, vtx );
-      rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = se_vtx_fmt_0[i];
-      rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = se_vtx_fmt_1[i];
+   /* EMIT_ATTR calls must be made in order, as they tell t_vertex.c how
+    * to build up a hardware vertex.
+    */
+   if ( !rmesa->swtcl.needproj ||
+       RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) { /* need w coord for projected textures */
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F, R200_VTX_XY | R200_VTX_Z0 | R200_VTX_W0 );
+      offset = 4;
+   }
+   else {
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_3F, R200_VTX_XY | R200_VTX_Z0 );
+      offset = 3;
    }
 
-   {
-      GLuint vte = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
-      GLuint vap = rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL];
-      GLuint needproj;
-
-      /* HW perspective divide is a win, but tiny vertex formats are a
-       * bigger one.
-       */
-      if (setup_tab[ind].vertex_format == TINY_VERTEX_FORMAT ||
-         (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
-        needproj = GL_TRUE;
-        vte |= R200_VTX_XY_FMT | R200_VTX_Z_FMT;
-        vte &= ~R200_VTX_W0_FMT;
-        vap |= R200_VAP_FORCE_W_TO_ONE;
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POINTSIZE )) {
+      EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F, R200_VTX_POINT_SIZE );
+      offset += 1;
+   }
+
+   rmesa->swtcl.coloroffset = offset;
+#if MESA_LITTLE_ENDIAN 
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
+#else
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
+#endif
+   offset += 1;
+
+   rmesa->swtcl.specoffset = 0;
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 ) ||
+       RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+
+#if MESA_LITTLE_ENDIAN 
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+        rmesa->swtcl.specoffset = offset;
+        EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
       }
       else {
-        needproj = GL_FALSE;
-        vte &= ~(R200_VTX_XY_FMT | R200_VTX_Z_FMT);
-        vte |= R200_VTX_W0_FMT;
-        vap &= ~R200_VAP_FORCE_W_TO_ONE;
+        EMIT_PAD( 3 );
       }
 
-      _tnl_need_projected_coords( ctx, needproj );
-      if (vte != rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL]) {
-        R200_STATECHANGE( rmesa, vte );
-        rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] = vte;
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+        EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
       }
-      if (vap != rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL]) {
-        R200_STATECHANGE( rmesa, vap );
-        rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] = vap;
+      else {
+        EMIT_PAD( 1 );
+      }
+#else
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+        EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
+      }
+      else {
+        EMIT_PAD( 1 );
       }
-   }
-}
-
-static void r200RenderStart( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
 
-   if (!setup_tab[rmesa->swtcl.SetupIndex].check_tex_sizes(ctx)) {
-      r200SetVertexFormat( ctx, rmesa->swtcl.SetupIndex | R200_PTEX_BIT);
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+        rmesa->swtcl.specoffset = offset;
+        EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_BGR, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
+      }
+      else {
+        EMIT_PAD( 3 );
+      }
+#endif
    }
-   
-   if (rmesa->dma.flush != 0 && 
-       rmesa->dma.flush != flush_last_swtcl_prim)
-      rmesa->dma.flush( rmesa );
-}
 
+   if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+      int i;
 
-void r200BuildVertices( GLcontext *ctx, GLuint start, GLuint count,
-                          GLuint newinputs )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-   GLubyte *v = ((GLubyte *)rmesa->swtcl.verts + 
-                (start << rmesa->swtcl.vertex_stride_shift));
-   GLuint stride = 1 << rmesa->swtcl.vertex_stride_shift;
+      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+        if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
+           GLuint sz = VB->TexCoordPtr[i]->size;
 
-   newinputs |= rmesa->swtcl.SetupNewInputs;
-   rmesa->swtcl.SetupNewInputs = 0;
+           fmt_1 |= sz << (3 * i);
+           EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_1F + sz - 1, 0 );
+        }
+      }
+   }
 
-   if (!newinputs)
-      return;
+   if ( (rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] & R200_FOG_USE_MASK)
+      != R200_FOG_USE_SPEC_ALPHA ) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~R200_FOG_USE_MASK;
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= R200_FOG_USE_SPEC_ALPHA;
+   }
 
-   setup_tab[rmesa->swtcl.SetupIndex].emit( ctx, start, count, v, stride );
+   if (!RENDERINPUTS_EQUAL( rmesa->radeon.tnl_index_bitset, index_bitset ) ||
+       (rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0) ||
+       (rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
+      R200_NEWPRIM(rmesa);
+      R200_STATECHANGE( rmesa, vtx );
+      rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = fmt_0;
+      rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = fmt_1;
+
+      rmesa->radeon.swtcl.vertex_size =
+         _tnl_install_attrs( ctx,
+                             rmesa->radeon.swtcl.vertex_attrs, 
+                             rmesa->radeon.swtcl.vertex_attr_count,
+                             NULL, 0 );
+      rmesa->radeon.swtcl.vertex_size /= 4;
+      RENDERINPUTS_COPY( rmesa->radeon.tnl_index_bitset, index_bitset );
+   }
 }
 
 
-void r200ChooseVertexState( GLcontext *ctx )
+static void r200RenderStart( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT( ctx );
-   GLuint ind = (R200_XYZW_BIT | R200_RGBA_BIT);
-
-   if (!rmesa->TclFallback || rmesa->Fallback)
-      return;
-
-   if (ctx->Fog.Enabled || (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR))
-      ind |= R200_SPEC_BIT;
-
-   if (ctx->Texture._EnabledUnits & 0x2)  /* unit 1 enabled */
-      ind |= R200_TEX0_BIT|R200_TEX1_BIT;
-   else if (ctx->Texture._EnabledUnits & 0x1)  /* unit 1 enabled */
-      ind |= R200_TEX0_BIT;
 
-   r200SetVertexFormat( ctx, ind );
+   r200SetVertexFormat( ctx );
 }
 
 
-/* Flush vertices in the current dma region.
+/**
+ * Set vertex state for SW TCL.  The primary purpose of this function is to
+ * determine in advance whether the hardware can / should do the projection
+ * divide, or whether Mesa should do it instead.
  */
-static void flush_last_swtcl_prim( r200ContextPtr rmesa  )
+void r200ChooseVertexState( GLcontext *ctx )
 {
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   rmesa->dma.flush = 0;
-
-   if (rmesa->dma.current.buf) {
-      struct r200_dma_region *current = &rmesa->dma.current;
-      GLuint current_offset = (rmesa->r200Screen->gart_buffer_offset +
-                              current->buf->buf->idx * RADEON_BUFFER_SIZE + 
-                              current->start);
-
-      assert (!(rmesa->swtcl.hw_primitive & R200_VF_PRIM_WALK_IND));
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLuint vte;
+   GLuint vap;
 
-      assert (current->start + 
-             rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-             current->ptr);
+   /* We must ensure that we don't do _tnl_need_projected_coords while in a
+    * rasterization fallback.  As this function will be called again when we
+    * leave a rasterization fallback, we can just skip it for now.
+    */
+   if (rmesa->radeon.Fallback != 0)
+      return;
 
-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-        r200EmitVertexAOS( rmesa,
-                             rmesa->swtcl.vertex_size,
-                             current_offset);
+   vte = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
+   vap = rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL];
 
-        r200EmitVbufPrim( rmesa,
-                          rmesa->swtcl.hw_primitive,
-                          rmesa->swtcl.numverts);
+   /* HW perspective divide is a win, but tiny vertex formats are a
+    * bigger one.
+    */
+   if (!RENDERINPUTS_TEST_RANGE( tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )
+       || (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
+      rmesa->swtcl.needproj = GL_TRUE;
+      vte |= R200_VTX_XY_FMT | R200_VTX_Z_FMT;
+      vte &= ~R200_VTX_W0_FMT;
+      if (RENDERINPUTS_TEST_RANGE( tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+        vap &= ~R200_VAP_FORCE_W_TO_ONE;
+      }
+      else {
+        vap |= R200_VAP_FORCE_W_TO_ONE;
       }
-
-      rmesa->swtcl.numverts = 0;
-      current->start = current->ptr;
    }
-}
-
-
-/* Alloc space in the current dma region.
- */
-static __inline void *r200AllocDmaLowVerts( r200ContextPtr rmesa,
-                                             int nverts, int vsize )
-{
-   GLuint bytes = vsize * nverts;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      r200RefillCurrentDmaRegion( rmesa );
-
-   if (!rmesa->dma.flush) {
-      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-      rmesa->dma.flush = flush_last_swtcl_prim;
+   else {
+      rmesa->swtcl.needproj = GL_FALSE;
+      vte &= ~(R200_VTX_XY_FMT | R200_VTX_Z_FMT);
+      vte |= R200_VTX_W0_FMT;
+      vap &= ~R200_VAP_FORCE_W_TO_ONE;
    }
 
-   ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
-   ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
-   ASSERT( rmesa->dma.current.start + 
-          rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-          rmesa->dma.current.ptr );
+   _tnl_need_projected_coords( ctx, rmesa->swtcl.needproj );
 
-
-   {
-      GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
-      rmesa->dma.current.ptr += bytes;
-      rmesa->swtcl.numverts += nverts;
-      return head;
+   if (vte != rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL]) {
+      R200_STATECHANGE( rmesa, vte );
+      rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] = vte;
    }
 
+   if (vap != rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL]) {
+      R200_STATECHANGE( rmesa, vap );
+      rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] = vap;
+   }
 }
 
-
-
-
-void r200_emit_contiguous_verts( GLcontext *ctx, GLuint start, GLuint count )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint vertex_size = rmesa->swtcl.vertex_size * 4;
-   CARD32 *dest = r200AllocDmaLowVerts( rmesa, count-start, vertex_size );
-   setup_tab[rmesa->swtcl.SetupIndex].emit( ctx, start, count, dest, 
-                                           vertex_size );
-}
-
-
-
-void r200_emit_indexed_verts( GLcontext *ctx, GLuint start, GLuint count )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   r200AllocDmaRegionVerts( rmesa, 
-                             &rmesa->swtcl.indexed_verts, 
-                             count - start,
-                             rmesa->swtcl.vertex_size * 4, 
-                             64);
-
-   setup_tab[rmesa->swtcl.SetupIndex].emit( 
-      ctx, start, count, 
-      rmesa->swtcl.indexed_verts.address + rmesa->swtcl.indexed_verts.start, 
-      rmesa->swtcl.vertex_size * 4 );
-}
-
-
-/*
- * Render unclipped vertex buffers by emitting vertices directly to
- * dma buffers.  Use strip/fan hardware primitives where possible.
- * Try to simulate missing primitives with indexed vertices.
- */
-#define HAVE_POINTS      1
-#define HAVE_LINES       1
-#define HAVE_LINE_STRIPS 1
-#define HAVE_TRIANGLES   1
-#define HAVE_TRI_STRIPS  1
-#define HAVE_TRI_STRIP_1 0
-#define HAVE_TRI_FANS    1
-#define HAVE_QUADS       1
-#define HAVE_QUAD_STRIPS 1
-#define HAVE_POLYGONS    1
-#define HAVE_ELTS        1
-
-static const GLuint hw_prim[GL_POLYGON+1] = {
-   R200_VF_PRIM_POINTS,
-   R200_VF_PRIM_LINES,
-   0,
-   R200_VF_PRIM_LINE_STRIP,
-   R200_VF_PRIM_TRIANGLES,
-   R200_VF_PRIM_TRIANGLE_STRIP,
-   R200_VF_PRIM_TRIANGLE_FAN,
-   R200_VF_PRIM_QUADS,
-   R200_VF_PRIM_QUAD_STRIP,
-   R200_VF_PRIM_POLYGON
-};
-
-static __inline void r200DmaPrimitive( r200ContextPtr rmesa, GLenum prim )
-{
-   R200_NEWPRIM( rmesa );
-   rmesa->swtcl.hw_primitive = hw_prim[prim];
-   assert(rmesa->dma.current.ptr == rmesa->dma.current.start);
-}
-
-static __inline void r200EltPrimitive( r200ContextPtr rmesa, GLenum prim )
-{
-   R200_NEWPRIM( rmesa );
-   rmesa->swtcl.hw_primitive = hw_prim[prim] | R200_VF_PRIM_WALK_IND;
-}
-
-
-static void VERT_FALLBACK( GLcontext *ctx,
-                          GLuint start,
-                          GLuint count,
-                          GLuint flags )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   tnl->Driver.Render.PrimitiveNotify( ctx, flags & PRIM_MODE_MASK );
-   tnl->Driver.Render.BuildVertices( ctx, start, count, ~0 );
-   tnl->Driver.Render.PrimTabVerts[flags&PRIM_MODE_MASK]( ctx, start, count, flags );
-   R200_CONTEXT(ctx)->swtcl.SetupNewInputs = _TNL_BIT_POS;
-}
-
-static void ELT_FALLBACK( GLcontext *ctx,
-                         GLuint start,
-                         GLuint count,
-                         GLuint flags )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   tnl->Driver.Render.PrimitiveNotify( ctx, flags & PRIM_MODE_MASK );
-   tnl->Driver.Render.BuildVertices( ctx, start, count, ~0 );
-   tnl->Driver.Render.PrimTabElts[flags&PRIM_MODE_MASK]( ctx, start, count, flags );
-   R200_CONTEXT(ctx)->swtcl.SetupNewInputs = _TNL_BIT_POS;
-}
-
-
-#define LOCAL_VARS r200ContextPtr rmesa = R200_CONTEXT(ctx)
-#define ELTS_VARS  GLushort *dest
-#define INIT( prim ) r200DmaPrimitive( rmesa, prim )
-#define ELT_INIT(prim) r200EltPrimitive( rmesa, prim )
-#define NEW_PRIMITIVE()  R200_NEWPRIM( rmesa )
-#define NEW_BUFFER()  r200RefillCurrentDmaRegion( rmesa )
-#define GET_CURRENT_VB_MAX_VERTS() \
-  (((int)rmesa->dma.current.end - (int)rmesa->dma.current.ptr) / (rmesa->swtcl.vertex_size*4))
-#define GET_SUBSEQUENT_VB_MAX_VERTS() \
-  ((RADEON_BUFFER_SIZE) / (rmesa->swtcl.vertex_size*4))
-
-#define GET_CURRENT_VB_MAX_ELTS() \
-  ((R200_CMD_BUF_SZ - (rmesa->store.cmd_used + 16)) / 2)
-#define GET_SUBSEQUENT_VB_MAX_ELTS() \
-  ((R200_CMD_BUF_SZ - 1024) / 2)
-
-
-
-/* How do you extend an existing primitive?
- */
-#define ALLOC_ELTS(nr)                                                 \
-do {                                                                   \
-   if (rmesa->dma.flush == r200FlushElts &&                            \
-       rmesa->store.cmd_used + nr*2 < R200_CMD_BUF_SZ) {               \
-                                                                       \
-      dest = (GLushort *)(rmesa->store.cmd_buf +                       \
-                         rmesa->store.cmd_used);                       \
-      rmesa->store.cmd_used += nr*2;                                   \
-   }                                                                   \
-   else {                                                              \
-      if (rmesa->dma.flush) {                                          \
-        rmesa->dma.flush( rmesa );                                     \
-      }                                                                        \
-                                                                       \
-      r200EmitVertexAOS( rmesa,                                        \
-                          rmesa->swtcl.vertex_size,                    \
-                          (rmesa->r200Screen->gart_buffer_offset +     \
-                           rmesa->swtcl.indexed_verts.buf->buf->idx *  \
-                           RADEON_BUFFER_SIZE +                        \
-                           rmesa->swtcl.indexed_verts.start));         \
-                                                                       \
-      dest = r200AllocEltsOpenEnded( rmesa,                            \
-                                      rmesa->swtcl.hw_primitive,       \
-                                      nr );                            \
-   }                                                                   \
-} while (0)
-
-#define ALLOC_ELTS_NEW_PRIMITIVE(nr) ALLOC_ELTS( nr )
-
-#ifdef MESA_BIG_ENDIAN
-/* We could do without (most of) this ugliness if dest was always 32 bit word aligned... */
-#define EMIT_ELT(offset, x) do {                                \
-        int off = offset + ( ( (GLuint)dest & 0x2 ) >> 1 );     \
-        GLushort *des = (GLushort *)( (GLuint)dest & ~0x2 );    \
-        (des)[ off + 1 - 2 * ( off & 1 ) ] = (GLushort)(x); } while (0)
-#else
-#define EMIT_ELT(offset, x) (dest)[offset] = (GLushort) (x)
-#endif
-#define EMIT_TWO_ELTS(offset, x, y)  *(GLuint *)(dest+offset) = ((y)<<16)|(x);
-#define INCR_ELTS( nr ) dest += nr
-#define RELEASE_ELT_VERTS() \
-  r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ )
-#define EMIT_VERTS( ctx, j, nr ) \
-  r200_emit_contiguous_verts(ctx, j, (j)+(nr))
-#define EMIT_INDEXED_VERTS( ctx, start, count ) \
-  r200_emit_indexed_verts( ctx, start, count )
-
-
-#define TAG(x) r200_dma_##x
-#include "tnl_dd/t_dd_dmatmp.h"
-
-
-/**********************************************************************/
-/*                          Render pipeline stage                     */
-/**********************************************************************/
-
-
-static GLboolean r200_run_render( GLcontext *ctx,
-                                   struct tnl_pipeline_stage *stage )
+void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct vertex_buffer *VB = &tnl->vb;
-   GLuint i, length, flags = 0;
-   render_func *tab = TAG(render_tab_verts);
-
-   if (rmesa->swtcl.indexed_verts.buf && (!VB->Elts || stage->changed_inputs)) 
-      RELEASE_ELT_VERTS();
-       
-   
+   rcommonEnsureCmdBufSpace(&rmesa->radeon,
+                           rmesa->hw.max_state_size + (12*sizeof(int)),
+                           __FUNCTION__);
 
-   if ((R200_DEBUG & DEBUG_VERTS) ||     /* No debug */
-       VB->ClipOrMask ||                /* No clipping */
-       rmesa->swtcl.RenderIndex != 0 ||  /* No per-vertex manipulations */
-       ctx->Line.StippleFlag)            /* No stipple -- fix me? */
-      return GL_TRUE;          
-
-   if (VB->Elts) {
-      tab = TAG(render_tab_elts);
-      if (!rmesa->swtcl.indexed_verts.buf)
-        if (!TAG(emit_elt_verts)(ctx, 0, VB->Count))
-           return GL_TRUE;     /* too many vertices */
-   }
-
-   tnl->Driver.Render.Start( ctx );
-
-   for (i = 0 ; i < VB->PrimitiveCount ; i++)
-   {
-      GLuint prim = VB->Primitive[i].mode;
-      GLuint start = VB->Primitive[i].start;
-      GLuint length = VB->Primitive[i].count;
 
-      if (!length)
-        continue;
+   r200EmitState(rmesa);
+   r200EmitVertexAOS( rmesa,
+                     rmesa->radeon.swtcl.vertex_size,
+                     rmesa->radeon.dma.current,
+                     current_offset);
 
-      if (R200_DEBUG & DEBUG_PRIMS)
-        fprintf(stderr, "r200_render.c: prim %s %d..%d\n", 
-                _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
-                start, start+length);
+                     
+   r200EmitVbufPrim( rmesa,
+                    rmesa->radeon.swtcl.hw_primitive,
+                    rmesa->radeon.swtcl.numverts);
 
-      tab[prim & PRIM_MODE_MASK]( ctx, start, start + length, flags );
-   }
-
-   tnl->Driver.Render.Finish( ctx );
-
-   return GL_FALSE;            /* finished the pipe */
 }
 
+/**************************************************************************/
 
 
-static void r200_check_render( GLcontext *ctx,
-                                struct tnl_pipeline_stage *stage )
+static INLINE GLuint reduced_hw_prim( GLcontext *ctx, GLuint prim)
 {
-   GLuint inputs = _TNL_BIT_POS | _TNL_BIT_COLOR0;
-
-   if (ctx->RenderMode == GL_RENDER) {
-      if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR)
-        inputs |= _TNL_BIT_COLOR1;
-
-      if (ctx->Texture.Unit[0]._ReallyEnabled)
-        inputs |= _TNL_BIT_TEX0;
-
-      if (ctx->Texture.Unit[1]._ReallyEnabled)
-        inputs |= _TNL_BIT_TEX1;
-
-      if (ctx->Fog.Enabled)
-        inputs |= _TNL_BIT_FOG;
+   switch (prim) {
+   case GL_POINTS:
+      return (ctx->Point.PointSprite ||
+        ((ctx->_TriangleCaps & (DD_POINT_SIZE | DD_POINT_ATTEN)) &&
+        !(ctx->_TriangleCaps & (DD_POINT_SMOOTH)))) ?
+        R200_VF_PRIM_POINT_SPRITES : R200_VF_PRIM_POINTS;
+   case GL_LINES:
+   /* fallthrough */
+   case GL_LINE_LOOP:
+   /* fallthrough */
+   case GL_LINE_STRIP:
+      return R200_VF_PRIM_LINES;
+   default:
+   /* all others reduced to triangles */
+      return R200_VF_PRIM_TRIANGLES;
    }
-
-   stage->inputs = inputs;
-}
-
-
-static void dtr( struct tnl_pipeline_stage *stage )
-{
-   (void)stage;
 }
 
 
-const struct tnl_pipeline_stage _r200_render_stage =
-{
-   "r200 render",
-   (_DD_NEW_SEPARATE_SPECULAR |
-    _NEW_TEXTURE|
-    _NEW_FOG|
-    _NEW_RENDERMODE),          /* re-check (new inputs) */
-   0,                          /* re-run (always runs) */
-   GL_TRUE,                    /* active */
-   0, 0,                       /* inputs (set in check_render), outputs */
-   0, 0,                       /* changed_inputs, private */
-   dtr,                                /* destructor */
-   r200_check_render,          /* check - initially set to alloc data */
-   r200_run_render             /* run */
-};
-
-
-
-/**************************************************************************/
-
-
-static const GLuint reduced_hw_prim[GL_POLYGON+1] = {
-   R200_VF_PRIM_POINTS,
-   R200_VF_PRIM_LINES,
-   R200_VF_PRIM_LINES,
-   R200_VF_PRIM_LINES,
-   R200_VF_PRIM_TRIANGLES,
-   R200_VF_PRIM_TRIANGLES,
-   R200_VF_PRIM_TRIANGLES,
-   R200_VF_PRIM_TRIANGLES,
-   R200_VF_PRIM_TRIANGLES,
-   R200_VF_PRIM_TRIANGLES
-};
-
 static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim );
 static void r200RenderPrimitive( GLcontext *ctx, GLenum prim );
 static void r200ResetLineStipple( GLcontext *ctx );
 
-#undef HAVE_QUADS
-#define HAVE_QUADS 0
-
-#undef HAVE_QUAD_STRIPS
-#define HAVE_QUAD_STRIPS 0
-
 /***********************************************************************
  *                    Emit primitives as inline vertices               *
  ***********************************************************************/
 
+#define HAVE_POINTS      1
+#define HAVE_LINES       1
+#define HAVE_LINE_STRIPS 1
+#define HAVE_TRIANGLES   1
+#define HAVE_TRI_STRIPS  1
+#define HAVE_TRI_STRIP_1 0
+#define HAVE_TRI_FANS    1
+#define HAVE_QUADS       0
+#define HAVE_QUAD_STRIPS 0
+#define HAVE_POLYGONS    1
+#define HAVE_ELTS        0
+
 #undef LOCAL_VARS
+#undef ALLOC_VERTS
 #define CTX_ARG r200ContextPtr rmesa
-#define CTX_ARG2 rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) r200AllocDmaLowVerts( rmesa, n, size * 4 )
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 )
 #define LOCAL_VARS                                             \
    r200ContextPtr rmesa = R200_CONTEXT(ctx);           \
-   const GLuint shift = rmesa->swtcl.vertex_stride_shift;      \
-   const char *r200verts = (char *)rmesa->swtcl.verts;
-#define VERT(x) (r200Vertex *)(r200verts + (x << shift))
-#define VERTEX r200Vertex 
+   const char *r200verts = (char *)rmesa->radeon.swtcl.verts;
+#define VERT(x) (radeonVertex *)(r200verts + ((x) * vertsize * sizeof(int)))
+#define VERTEX radeonVertex 
 #define DO_DEBUG_VERTS (1 && (R200_DEBUG & DEBUG_VERTS))
-#define PRINT_VERTEX(v) r200_print_vertex(rmesa->glCtx, v)
+
 #undef TAG
 #define TAG(x) r200_##x
 #include "tnl_dd/t_dd_triemit.h"
@@ -796,10 +368,10 @@ static void r200ResetLineStipple( GLcontext *ctx );
 
 
 static struct {
-   points_func         points;
-   line_func           line;
-   triangle_func       triangle;
-   quad_func           quad;
+   tnl_points_func             points;
+   tnl_line_func               line;
+   tnl_triangle_func   triangle;
+   tnl_quad_func               quad;
 } rast_tab[R200_MAX_TRIFUNC];
 
 
@@ -816,7 +388,6 @@ static struct {
 
 #define HAVE_RGBA   1
 #define HAVE_SPEC   1
-#define HAVE_INDEX  0
 #define HAVE_BACK_COLORS  0
 #define HAVE_HW_FLATSHADE 1
 #define TAB rast_tab
@@ -828,23 +399,46 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e<<rmesa->swtcl.vertex_stride_shift))
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
+
+#define VERT_SET_RGBA( v, c )                                          \
+do {                                                           \
+   radeon_color_t *color = (radeon_color_t *)&((v)->ui[coloroffset]);  \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]);               \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]);             \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]);              \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]);             \
+} while (0)
 
-#define VERT_SET_RGBA( v, c )    v->ui[coloroffset] = LE32_TO_CPU(*(GLuint *)c)
 #define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
-#define VERT_SAVE_RGBA( idx )    color[idx] = CPU_TO_LE32(v[idx]->ui[coloroffset])
-#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = LE32_TO_CPU(color[idx])
-
-#define VERT_SET_SPEC( v0, c )   if (havespec) {                       \
-                                       v0->v.specular.red   = (c)[0];  \
-                                       v0->v.specular.green = (c)[1];  \
-                                       v0->v.specular.blue  = (c)[2]; }
-#define VERT_COPY_SPEC( v0, v1 ) if (havespec) {                                       \
-                                       v0->v.specular.red   = v1->v.specular.red;      \
-                                       v0->v.specular.green = v1->v.specular.green;    \
-                                       v0->v.specular.blue  = v1->v.specular.blue; }
-#define VERT_SAVE_SPEC( idx )    if (havespec) spec[idx] = CPU_TO_LE32(v[idx]->ui[5])
-#define VERT_RESTORE_SPEC( idx ) if (havespec) v[idx]->ui[5] = LE32_TO_CPU(spec[idx])
+
+#define VERT_SET_SPEC( v, c )                                  \
+do {                                                           \
+   if (specoffset) {                                           \
+      radeon_color_t *spec = (radeon_color_t *)&((v)->ui[specoffset]); \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);     \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);   \
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);    \
+   }                                                           \
+} while (0)
+#define VERT_COPY_SPEC( v0, v1 )                       \
+do {                                                   \
+   if (specoffset) {                                   \
+      radeon_color_t *spec0 = (radeon_color_t *)&((v0)->ui[specoffset]);       \
+      radeon_color_t *spec1 = (radeon_color_t *)&((v1)->ui[specoffset]);       \
+      spec0->red   = spec1->red;       \
+      spec0->green = spec1->green;     \
+      spec0->blue  = spec1->blue;      \
+   }                                                   \
+} while (0)
+
+/* These don't need LE32_TO_CPU() as they are used to save and restore
+ * colors which are already in the correct format.
+ */
+#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
+#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
+#define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
 
 #undef LOCAL_VARS
 #undef TAG
@@ -853,16 +447,16 @@ static struct {
 #define LOCAL_VARS(n)                                                  \
    r200ContextPtr rmesa = R200_CONTEXT(ctx);                   \
    GLuint color[n], spec[n];                                           \
-   GLuint coloroffset = (rmesa->swtcl.vertex_size == 4 ? 3 : 4);       \
-   GLboolean havespec = (rmesa->swtcl.vertex_size > 4);                        \
-   (void) color; (void) spec; (void) coloroffset; (void) havespec;
+   GLuint coloroffset = rmesa->swtcl.coloroffset;      \
+   GLuint specoffset = rmesa->swtcl.specoffset;                        \
+   (void) color; (void) spec; (void) coloroffset; (void) specoffset;
 
 /***********************************************************************
  *                Helpers for rendering unfilled primitives            *
  ***********************************************************************/
 
-#define RASTERIZE(x) r200RasterPrimitive( ctx, reduced_hw_prim[x] )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RASTERIZE(x) r200RasterPrimitive( ctx, reduced_hw_prim(ctx, x) )
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
 #undef TAG
 #define TAG(x) x
 #include "tnl_dd/t_dd_unfilled.h"
@@ -903,7 +497,6 @@ static void init_rast_tab( void )
 /*               Render unclipped begin/end objects                   */
 /**********************************************************************/
 
-#define VERT(x) (r200Vertex *)(r200verts + (x << shift))
 #define RENDER_POINTS( start, count )          \
    for ( ; start < count ; start++)            \
       r200_point( rmesa, VERT(start) )
@@ -919,8 +512,8 @@ static void init_rast_tab( void )
 #undef LOCAL_VARS
 #define LOCAL_VARS                                             \
    r200ContextPtr rmesa = R200_CONTEXT(ctx);           \
-   const GLuint shift = rmesa->swtcl.vertex_stride_shift;              \
-   const char *r200verts = (char *)rmesa->swtcl.verts;         \
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;            \
+   const char *r200verts = (char *)rmesa->radeon.swtcl.verts;          \
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;       \
    const GLboolean stipple = ctx->Line.StippleFlag;            \
    (void) elt; (void) stipple;
@@ -949,13 +542,13 @@ void r200ChooseRenderState( GLcontext *ctx )
    GLuint index = 0;
    GLuint flags = ctx->_TriangleCaps;
 
-   if (!rmesa->TclFallback || rmesa->Fallback) 
+   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback) 
       return;
 
    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R200_TWOSIDE_BIT;
    if (flags & DD_TRI_UNFILLED)      index |= R200_UNFILLED_BIT;
 
-   if (index != rmesa->swtcl.RenderIndex) {
+   if (index != rmesa->radeon.swtcl.RenderIndex) {
       tnl->Driver.Render.Points = rast_tab[index].points;
       tnl->Driver.Render.Line = rast_tab[index].line;
       tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -972,7 +565,7 @@ void r200ChooseRenderState( GLcontext *ctx )
         tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
       }
 
-      rmesa->swtcl.RenderIndex = index;
+      rmesa->radeon.swtcl.RenderIndex = index;
    }
 }
 
@@ -986,18 +579,29 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   if (rmesa->swtcl.hw_primitive != hwprim) {
+   if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
+      /* need to disable perspective-correct texturing for point sprites */
+      if ((hwprim & 0xf) == R200_VF_PRIM_POINT_SPRITES && ctx->Point.PointSprite) {
+        if (rmesa->hw.set.cmd[SET_RE_CNTL] & R200_PERSPECTIVE_ENABLE) {
+           R200_STATECHANGE( rmesa, set );
+           rmesa->hw.set.cmd[SET_RE_CNTL] &= ~R200_PERSPECTIVE_ENABLE;
+        }
+      }
+      else if (!(rmesa->hw.set.cmd[SET_RE_CNTL] & R200_PERSPECTIVE_ENABLE)) {
+        R200_STATECHANGE( rmesa, set );
+        rmesa->hw.set.cmd[SET_RE_CNTL] |= R200_PERSPECTIVE_ENABLE;
+      }
       R200_NEWPRIM( rmesa );
-      rmesa->swtcl.hw_primitive = hwprim;
+      rmesa->radeon.swtcl.hw_primitive = hwprim;
    }
 }
 
 static void r200RenderPrimitive( GLcontext *ctx, GLenum prim )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   rmesa->swtcl.render_primitive = prim;
+   rmesa->radeon.swtcl.render_primitive = prim;
    if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
-      r200RasterPrimitive( ctx, reduced_hw_prim[prim] );
+      r200RasterPrimitive( ctx, reduced_hw_prim(ctx, prim) );
 }
 
 static void r200RenderFinish( GLcontext *ctx )
@@ -1020,8 +624,6 @@ static const char * const fallbackStrings[] = {
    "glDrawBuffer(GL_FRONT_AND_BACK)",
    "glEnable(GL_STENCIL) without hw stencil buffer",
    "glRenderMode(selection or feedback)",
-   "glBlendEquation",
-   "glBlendFunc(mode != ADD)",
    "R200_NO_RAST",
    "Mixing GL_CLAMP_TO_BORDER and GL_CLAMP (or GL_MIRROR_CLAMP_ATI)"
 };
@@ -1042,16 +644,15 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->Fallback;
+   GLuint oldfallback = rmesa->radeon.Fallback;
 
    if (mode) {
-      rmesa->Fallback |= bit;
+      rmesa->radeon.Fallback |= bit;
       if (oldfallback == 0) {
         R200_FIREVERTICES( rmesa );
         TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_TRUE );
         _swsetup_Wakeup( ctx );
-        _tnl_need_projected_coords( ctx, GL_TRUE );
-        rmesa->swtcl.RenderIndex = ~0;
+        rmesa->radeon.swtcl.RenderIndex = ~0;
          if (R200_DEBUG & DEBUG_FALLBACKS) {
             fprintf(stderr, "R200 begin rasterization fallback: 0x%x %s\n",
                     bit, getFallbackString(bit));
@@ -1059,20 +660,28 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
       }
    }
    else {
-      rmesa->Fallback &= ~bit;
+      rmesa->radeon.Fallback &= ~bit;
       if (oldfallback == bit) {
+
         _swrast_flush( ctx );
         tnl->Driver.Render.Start = r200RenderStart;
         tnl->Driver.Render.PrimitiveNotify = r200RenderPrimitive;
         tnl->Driver.Render.Finish = r200RenderFinish;
-        tnl->Driver.Render.BuildVertices = r200BuildVertices;
+
+        tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+        tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+        tnl->Driver.Render.Interp = _tnl_interp;
+
         tnl->Driver.Render.ResetLineStipple = r200ResetLineStipple;
         TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_FALSE );
-        if (rmesa->TclFallback) {
-           /* These are already done if rmesa->TclFallback goes to
+        if (rmesa->radeon.TclFallback) {
+           /* These are already done if rmesa->radeon.TclFallback goes to
             * zero above. But not if it doesn't (R200_NO_TCL for
             * example?)
             */
+           _tnl_invalidate_vertex_state( ctx, ~0 );
+           _tnl_invalidate_vertices( ctx, ~0 );
+           RENDERINPUTS_ZERO( rmesa->radeon.tnl_index_bitset );
            r200ChooseVertexState( ctx );
            r200ChooseRenderState( ctx );
         }
@@ -1087,7 +696,15 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 
 
-/* Cope with depth operations by drawing individual pixels as points??? 
+/**
+ * Cope with depth operations by drawing individual pixels as points.
+ * 
+ * \todo
+ * The way the vertex state is set in this routine is hokey.  It seems to
+ * work, but it's very hackish.  This whole routine is pretty hackish.  If
+ * the bitmap is small enough, it seems like it would be faster to copy it
+ * to AGP memory and use it as a non-power-of-two texture (i.e.,
+ * NV_texture_rectangle).
  */
 void
 r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
@@ -1098,7 +715,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    const GLfloat *rc = ctx->Current.RasterColor; 
    GLint row, col;
-   r200Vertex vert;
+   radeonVertex vert;
    GLuint orig_vte;
    GLuint h;
 
@@ -1109,7 +726,37 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 
    /* Choose tiny vertex format
     */
-   r200SetVertexFormat( ctx, R200_XYZW_BIT | R200_RGBA_BIT );
+   {
+      const GLuint fmt_0 = R200_VTX_XY | R200_VTX_Z0 | R200_VTX_W0
+         | (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT);
+      const GLuint fmt_1 = 0;
+      GLuint vte = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
+      GLuint vap = rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL];
+
+      vte &= ~(R200_VTX_XY_FMT | R200_VTX_Z_FMT);
+      vte |= R200_VTX_W0_FMT;
+      vap &= ~R200_VAP_FORCE_W_TO_ONE;
+
+      rmesa->radeon.swtcl.vertex_size = 5;
+
+      if ( (rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0)
+          || (rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
+        R200_NEWPRIM(rmesa);
+        R200_STATECHANGE( rmesa, vtx );
+        rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = fmt_0;
+        rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = fmt_1;
+      }
+
+      if (vte != rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL]) {
+        R200_STATECHANGE( rmesa, vte );
+        rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] = vte;
+      }
+
+      if (vap != rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL]) {
+        R200_STATECHANGE( rmesa, vap );
+        rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] = vap;
+      }
+   }
 
    /* Ready for point primitives:
     */
@@ -1167,17 +814,17 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 
    /* Update window height
     */
-   LOCK_HARDWARE( rmesa );
-   UNLOCK_HARDWARE( rmesa );
-   h = rmesa->dri.drawable->h + rmesa->dri.drawable->y;
-   px += rmesa->dri.drawable->x;
+   LOCK_HARDWARE( &rmesa->radeon );
+   UNLOCK_HARDWARE( &rmesa->radeon );
+   h = rmesa->radeon.dri.drawable->h + rmesa->radeon.dri.drawable->y;
+   px += rmesa->radeon.dri.drawable->x;
 
    /* Clipping handled by existing mechanisms in r200_ioctl.c?
     */
    for (row=0; row<height; row++) {
       const GLubyte *src = (const GLubyte *) 
-        _mesa_image_address( unpack, bitmap, width, height, 
-                             GL_COLOR_INDEX, GL_BITMAP, 0, row, 0 );
+        _mesa_image_address2d(unpack, bitmap, width, height, 
+                               GL_COLOR_INDEX, GL_BITMAP, row, 0 );
 
       if (unpack->LsbFirst) {
          /* Lsb first */
@@ -1225,18 +872,11 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 
    /* Need to restore vertexformat?
     */
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       r200ChooseVertexState( ctx );
 }
 
 
-void r200FlushVertices( GLcontext *ctx, GLuint flags )
-{
-   _tnl_FlushVertices( ctx, flags );
-
-   if (flags & FLUSH_STORED_VERTICES)
-      R200_NEWPRIM( R200_CONTEXT( ctx ) );
-}
 
 /**********************************************************************/
 /*                            Initialization.                         */
@@ -1246,12 +886,10 @@ void r200InitSwtcl( GLcontext *ctx )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint size = TNL_CONTEXT(ctx)->vb.Size;
    static int firsttime = 1;
 
    if (firsttime) {
       init_rast_tab();
-      init_setup_tab();
       firsttime = 0;
    }
 
@@ -1259,12 +897,18 @@ void r200InitSwtcl( GLcontext *ctx )
    tnl->Driver.Render.Finish = r200RenderFinish;
    tnl->Driver.Render.PrimitiveNotify = r200RenderPrimitive;
    tnl->Driver.Render.ResetLineStipple = r200ResetLineStipple;
-   tnl->Driver.Render.BuildVertices = r200BuildVertices;
+   tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+   tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+   tnl->Driver.Render.Interp = _tnl_interp;
 
-   rmesa->swtcl.verts = (GLubyte *)ALIGN_MALLOC( size * 16 * 4, 32 );
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->swtcl.render_primitive = GL_TRIANGLES;
-   rmesa->swtcl.hw_primitive = 0;
+   /* FIXME: what are these numbers? */
+   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+                      36 * sizeof(GLfloat) );
+   
+   rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+   rmesa->radeon.swtcl.hw_primitive = 0;
 }
 
 
@@ -1272,21 +916,4 @@ void r200DestroySwtcl( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   if (rmesa->swtcl.indexed_verts.buf) 
-      r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ );
-
-   if (rmesa->swtcl.verts) {
-      ALIGN_FREE(rmesa->swtcl.verts);
-      rmesa->swtcl.verts = 0;
-   }
-
-   if (rmesa->UbyteSecondaryColor.Ptr) {
-      ALIGN_FREE(rmesa->UbyteSecondaryColor.Ptr);
-      rmesa->UbyteSecondaryColor.Ptr = 0;
-   }
-
-   if (rmesa->UbyteColor.Ptr) {
-      ALIGN_FREE(rmesa->UbyteColor.Ptr);
-      rmesa->UbyteColor.Ptr = 0;
-   }
 }