r5xx: Dump shader constants when dumping program assembly.
[mesa.git] / src / mesa / drivers / dri / r300 / r300_swtcl.c
index f724a8e6f855b8e4ce1c95da6e639a10faacf11a..8aebd9be3ea0b9f434576702c7d2609c23b280f6 100644 (file)
@@ -40,6 +40,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "enums.h"
 #include "image.h"
 #include "imports.h"
+#include "light.h"
 #include "macros.h"
 
 #include "swrast/s_context.h"
@@ -59,14 +60,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 static void flush_last_swtcl_prim( r300ContextPtr rmesa  );
 
+
 void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset);
 void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr);
-#define EMIT_ATTR( ATTR, STYLE, F0 )                                   \
+#define EMIT_ATTR( ATTR, STYLE )                                       \
 do {                                                                   \
    rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);  \
    rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE); \
    rmesa->swtcl.vertex_attr_count++;                                   \
-   vap_fmt_0 |= F0;                                                            \
 } while (0)
 
 #define EMIT_PAD( N )                                                  \
@@ -79,217 +80,251 @@ do {                                                                      \
 
 static void r300SetVertexFormat( GLcontext *ctx )
 {
-   r300ContextPtr rmesa = R300_CONTEXT( ctx );
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct vertex_buffer *VB = &tnl->vb;
-   DECLARE_RENDERINPUTS(index_bitset);
-   int vap_fmt_0 = 0;
-   int vap_fmt_1 = 0;
-   int vap_vte_cntl = 0;
-   int offset = 0;
-
-   RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
-
-   /* Important:
-    */
-   if ( VB->NdcPtr != NULL ) {
-      VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
-   }
-   else {
-      VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
-   }
-
-   assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-   rmesa->swtcl.vertex_attr_count = 0;
-
-   /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
-    * build up a hardware vertex.
-    */
-   if ( !rmesa->swtcl.needproj ||
-       RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) { /* need w coord for projected textures */
-      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F, R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT);
-      vap_vte_cntl |= R300_VTX_XY_FMT | R300_VTX_Z_FMT | R300_VTX_W0_FMT;
-
-      offset = 4;
-   }
-   else {
-      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_3F, R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT );
-      vap_vte_cntl |= R300_VTX_XY_FMT | R300_VTX_Z_FMT;
-      offset = 3;
-   }
-
-   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POINTSIZE )) {
-      EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F, R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT);
-      offset += 1;
-   }
-
-   rmesa->swtcl.coloroffset = offset;
-#if MESA_LITTLE_ENDIAN 
-   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, R300_VAP_OUTPUT_VTX_FMT_0__COLOR_PRESENT );
-#else
-   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, R300_VAP_OUTPUT_VTX_FMT_0__COLOR_PRESENT );
-#endif
-   offset += 1;
-
-   rmesa->swtcl.specoffset = 0;
-   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 ) ||
-       RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
-
-#if MESA_LITTLE_ENDIAN 
-      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
-        rmesa->swtcl.specoffset = offset;
-        EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB, R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT );
-      }
-      else {
-        EMIT_PAD( 3 );
-      }
-
-      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
-        EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F, R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT );
-      }
-      else {
-        EMIT_PAD( 1 );
-      }
-#else
-      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
-        EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F, R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT );
-      }
-      else {
-        EMIT_PAD( 1 );
-      }
-
-      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
-        rmesa->swtcl.specoffset = offset;
-        EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_BGR, R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT );
-      }
-      else {
-        EMIT_PAD( 3 );
-      }
-#endif
-   }
-
-   if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
-      int i;
-
-      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-        if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-           GLuint sz = VB->TexCoordPtr[i]->size;
-
-           vap_fmt_1 |= sz << (3 * i);
-           EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_1F + sz - 1, 0 );
-        }
-      }
-   }
-
-#if 0
-   if ( (rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] & R200_FOG_USE_MASK)
-      != R200_FOG_USE_SPEC_ALPHA ) {
-      R200_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~R200_FOG_USE_MASK;
-      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= R200_FOG_USE_SPEC_ALPHA;
-   }
-#endif
-
-   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
-       (rmesa->hw.vof.cmd[R300_VOF_CNTL_0] != vap_fmt_0) ||
-       (rmesa->hw.vof.cmd[R300_VOF_CNTL_1] != vap_fmt_1) ) {
-//      R200_NEWPRIM(rmesa);
-      R300_STATECHANGE(rmesa, vof);
-      rmesa->hw.vof.cmd[R300_VOF_CNTL_0] =
-             vap_fmt_0;
-      rmesa->hw.vof.cmd[R300_VOF_CNTL_1] =
-             vap_fmt_1;
-
-      rmesa->swtcl.vertex_size =
-         _tnl_install_attrs( ctx,
-                             rmesa->swtcl.vertex_attrs, 
-                             rmesa->swtcl.vertex_attr_count,
-                             NULL, 0 );
-      rmesa->swtcl.vertex_size /= 4;
-      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
-   }
-}
-
+       r300ContextPtr rmesa = R300_CONTEXT( ctx );
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *VB = &tnl->vb;
+       DECLARE_RENDERINPUTS(index_bitset);
+       GLuint InputsRead = 0, OutputsWritten = 0; /* bitmasks built below: VERT_ATTRIB_* consumed / VERT_RESULT_* produced */
+       int vap_fmt_0 = 0; /* NOTE(review): only gets PT_SIZE_PRESENT OR'ed in and is never written to hw state in this function -- confirm */
+       int vap_vte_cntl = 0; /* NOTE(review): not referenced anywhere in the new body -- candidate for removal */
+       int offset = 0; /* running position inside the vertex, counted in attribute components (floats) */
+       int vte = 0; /* working copy of hw.vte.cmd[1] */
+       GLint inputs[VERT_ATTRIB_MAX]; /* attribute index -> input slot, -1 when the attribute is not read */
+       GLint tab[VERT_ATTRIB_MAX]; /* compact list of the attribute indices that are read */
+       int swizzle[VERT_ATTRIB_MAX][4]; /* per-input component swizzle handed to r300VAPInputRoute1 */
+       GLuint i, nr;
+       GLuint sz, vap_fmt_1 = 0; /* sz: component count of the attribute being emitted; vap_fmt_1: texcoord sizes, 3 bits per unit */
+
+       DECLARE_RENDERINPUTS(render_inputs_bitset);
+       RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
+       RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
+       RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset); /* store TNL's current inputs in the driver state */
+
+       vte = rmesa->hw.vte.cmd[1];
+       vte &= ~(R300_VTX_XY_FMT | R300_VTX_Z_FMT | R300_VTX_W0_FMT); /* clear the position-format bits; they are re-derived just below */
+       /* Important: source the position from NDC coords when TNL supplies them,
+        * otherwise from clip coords, and set the VTE format bits to match. */
+       if ( VB->NdcPtr != NULL ) {
+               VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
+               vte |= R300_VTX_XY_FMT | R300_VTX_Z_FMT;
+       }
+       else {
+               VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+               vte |= R300_VTX_W0_FMT;
+       }
 
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  )
-{
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+       assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+       rmesa->swtcl.vertex_attr_count = 0; /* restart the attribute list that EMIT_ATTR appends to */
+
+       /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
+        * build up a hardware vertex.
+        */
+       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POS)) {
+               sz = VB->AttribPtr[VERT_ATTRIB_POS]->size;
+               InputsRead |= 1 << VERT_ATTRIB_POS;
+               OutputsWritten |= 1 << VERT_RESULT_HPOS;
+               EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_1F + sz - 1 ); /* presumably EMIT_1F..EMIT_4F are consecutive, so this picks the sz-float format -- confirm */
+               offset = sz;
+       } else {
+               offset = 4; /* no position input: account for four components of padding instead */
+               EMIT_PAD(4 * sizeof(float));
+       }
 
-   rmesa->dma.flush = NULL;
+       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POINTSIZE )) { /* optional point size: one float right after the position */
+               EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
+               vap_fmt_0 |=  R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT; /* NOTE(review): vap_fmt_0 is not stored to hw.vof later (VOF_CNTL_0 comes from r300VAPOutputCntl0), so this bit looks dropped -- confirm */
+               offset += 1;
+       }
 
-   if (rmesa->dma.current.buf) {
-     struct r300_dma_region *current = &rmesa->dma.current;
-     //     GLuint current_offset = rmesa->state.swtcl_dma.aos_offset;
+       if (RENDERINPUTS_TEST(index_bitset, _TNL_ATTRIB_COLOR0)) { /* NOTE(review): swtcl.coloroffset keeps its previous value when COLOR0 is absent (specoffset is reset, coloroffset is not) -- confirm */
+               sz = VB->AttribPtr[VERT_ATTRIB_COLOR0]->size;
+               rmesa->swtcl.coloroffset = offset; /* remembered for the VERT_*_RGBA macros, which index the vertex with it */
+               InputsRead |= 1 << VERT_ATTRIB_COLOR0;
+               OutputsWritten |= 1 << VERT_RESULT_COL0;
+               EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_1F + sz - 1 ); /* color is emitted as sz floats; NOTE(review): R300_COLOR in this file packs ubytes at coloroffset -- confirm the two agree */
+               offset += sz;
+       }
 
-     GLuint current_offset = r300_mem_offset(rmesa,current->buf->id);
-     //      assert (!(rmesa->swtcl.hw_primitive & R200_VF_PRIM_WALK_IND));
+       rmesa->swtcl.specoffset = 0; /* 0 means "no secondary color"; the VERT_*_SPEC macros test it */
+       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+               sz = VB->AttribPtr[VERT_ATTRIB_COLOR1]->size;
+               rmesa->swtcl.specoffset = offset;
+               EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_1F + sz - 1 ); /* secondary color emitted as sz floats */
+               InputsRead |= 1 << VERT_ATTRIB_COLOR1;
+               OutputsWritten |= 1 << VERT_RESULT_COL1;
+       } /* offset is not advanced here; nothing after this point in the function reads it */
 
-      assert (current->start + 
-             rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-             current->ptr);
+       if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) { /* at least one texcoord set is requested */
+               int i; /* NOTE(review): shadows the function-scope GLuint i and is compared against MaxTextureUnits below */
+
+               for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+                       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
+                               sz = VB->TexCoordPtr[i]->size;
+                               InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
+                               OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
+                               EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_1F + sz - 1 );
+                               vap_fmt_1 |= sz << (3 * i); /* component count of unit i goes into its own 3-bit field */
+                       }
+               }
+       }
 
-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-       //       r200EnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
-       //                              rmesa->hw.max_state_size + VBUF_BUFSZ );
-       r300EmitVertexAOS( rmesa,
-                          rmesa->swtcl.vertex_size,
-                          current_offset);
+       for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) { /* first pass: number the attributes that are read sequentially */
+               if (InputsRead & (1 << i)) {
+                       inputs[i] = nr++;
+               } else {
+                       inputs[i] = -1; /* attribute not present in this vertex format */
+               }
+       }
        
-       r300EmitVbufPrim( rmesa,
-                         rmesa->swtcl.hw_primitive,
-                         rmesa->swtcl.numverts);
-      }
-
-      rmesa->swtcl.numverts = 0;
-      current->start = current->ptr;
-   }
-}
+       /* Fixed, apply to vir0 only: replace the sequential numbers with fixed slots (POS 0, COLOR0 2, COLOR1 3, TEXn 6+n). */
+       if (InputsRead & (1 << VERT_ATTRIB_POS))
+               inputs[VERT_ATTRIB_POS] = 0;
+       if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
+               inputs[VERT_ATTRIB_COLOR0] = 2;
+       if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
+               inputs[VERT_ATTRIB_COLOR1] = 3;
+       for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
+               if (InputsRead & (1 << i))
+                       inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
+       /* tab[]: the attribute indices set in InputsRead, in ascending order; nr ends up as their count. */
+       for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
+               if (InputsRead & (1 << i)) {
+                       tab[nr++] = i;
+               }
+       }
+       /* swizzle[]: one row per entry of tab[], indexed by the compact position (not the attribute index). */
+       for (i = 0; i < nr; i++) {
+               int ci;
+               /* Default every component to (0, 0, 0, 1) ... */
+               swizzle[i][0] = SWIZZLE_ZERO;
+               swizzle[i][1] = SWIZZLE_ZERO;
+               swizzle[i][2] = SWIZZLE_ZERO;
+               swizzle[i][3] = SWIZZLE_ONE;
+
+               for (ci = 0; ci < VB->AttribPtr[tab[i]]->size; ci++) { /* ... then pass through the components the attribute actually has */
+                       swizzle[i][ci] = ci;
+               }
+       }
 
+       R300_NEWPRIM(rmesa); /* the state below is rewritten unconditionally on every call */
+       R300_STATECHANGE(rmesa, vir[0]);
+       ((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
+               r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
+                                  VB->AttribPtr, inputs, tab, nr); /* return value becomes the packet0 register count */
+       R300_STATECHANGE(rmesa, vir[1]);
+       ((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
+               r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
+                                  nr);
+       /* Input control words are computed from the set of attributes read. */
+       R300_STATECHANGE(rmesa, vic);
+       rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+       rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
+       /* Output format: CNTL_0 from the results written, CNTL_1 from the texcoord sizes gathered above. */
+       R300_STATECHANGE(rmesa, vof);
+       rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
+       rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = vap_fmt_1;
+       /* Hand the attribute list collected by EMIT_ATTR/EMIT_PAD to t_vertex.c. */
+       rmesa->swtcl.vertex_size =
+               _tnl_install_attrs( ctx,
+                                   rmesa->swtcl.vertex_attrs, 
+                                   rmesa->swtcl.vertex_attr_count,
+                                   NULL, 0 );
+       
+       rmesa->swtcl.vertex_size /= 4; /* presumably bytes -> 32-bit words; other code in this file multiplies vertex_size by 4 to get bytes */
 
-static void *
-r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
-{
-  GLuint bytes = vsize * nverts;
+       RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
 
-  r300AllocDmaRegion(rmesa, &rmesa->state.swtcl_dma, bytes, 0);
 
-  if (!rmesa->dma.flush) {
-    rmesa->dma.flush = flush_last_swtcl_prim;
-    rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-  }
+       R300_STATECHANGE(rmesa, vte); /* finally write back the VTE word prepared at the top of the function */
+       rmesa->hw.vte.cmd[1] = vte;
+       rmesa->hw.vte.cmd[2] = rmesa->swtcl.vertex_size; /* vertex size after the divide by 4 above */
+}
 
 
-  rmesa->swtcl.numverts += nverts;
-  rmesa->dma.current.ptr += bytes;
-  return (rmesa->dma.current.address + rmesa->dma.current.ptr);
+/* Flush vertices in the current dma region: emit state, the vertex array
+ * and a draw packet for the buffered swtcl vertices, then restart the region. */
+static void flush_last_swtcl_prim( r300ContextPtr rmesa  )
+{
+       if (RADEON_DEBUG & DEBUG_IOCTL)
+               fprintf(stderr, "%s\n", __FUNCTION__);
+       /* Unregister first; r300AllocDmaLowVerts re-installs this callback on its next allocation. */
+       rmesa->dma.flush = NULL;
+
+       if (rmesa->dma.current.buf) {
+               struct r300_dma_region *current = &rmesa->dma.current;
+               GLuint current_offset = GET_START(current);
+
+               assert (current->start + 
+                       rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+                       current->ptr); /* the region must hold exactly numverts vertices of vertex_size words each */
+
+               if (rmesa->dma.current.start != rmesa->dma.current.ptr) { /* nothing is emitted when no vertices are buffered */
+
+                       r300EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size + (12*sizeof(int)), __FUNCTION__);
+                       /* State goes out before the vertex array pointer and the draw packet. */
+                       r300EmitState(rmesa);
+                       
+                       r300EmitVertexAOS( rmesa,
+                                          rmesa->swtcl.vertex_size,
+                                          current_offset);
+                       
+                       r300EmitVbufPrim( rmesa,
+                                         rmesa->swtcl.hw_primitive,
+                                         rmesa->swtcl.numverts);
+                       
+                       r300EmitCacheFlush(rmesa);
+               }
+               /* Whether or not anything was drawn, start the next batch at the current write position. */
+               rmesa->swtcl.numverts = 0;
+               current->start = current->ptr;
+       }
 }
 
-static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim );
-static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
-//static void r300ResetLineStipple( GLcontext *ctx );
-
-static void r300PrintVertex(r300Vertex *v)
+/* Alloc space in the current dma region: reserve nverts * vsize bytes
+ * (refilling the region if they do not fit) and return a pointer to their start. */
+static void *
+r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
 {
-  fprintf(stderr,"Vertex %p\n", v);
+       GLuint bytes = vsize * nverts; /* vsize is the per-vertex size in bytes (asserted below to be vertex_size * 4) */
+
+       if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
+               r300RefillCurrentDmaRegion( rmesa, bytes);
+
+       if (!rmesa->dma.flush) { /* no flush callback installed yet: register the swtcl one and flag stored vertices */
+               rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+               rmesa->dma.flush = flush_last_swtcl_prim;
+       }
 
+       ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
+       ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
+       ASSERT( rmesa->dma.current.start + 
+               rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+               rmesa->dma.current.ptr );
+
+       {
+               GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr); /* taken before ptr is advanced, so it addresses the new vertices */
+               rmesa->dma.current.ptr += bytes;
+               rmesa->swtcl.numverts += nverts;
+               return head;
+       }
 }
 
-static const GLenum reduced_prim[GL_POLYGON+1] = {
-   GL_POINTS,
-   GL_LINES,
-   GL_LINES,
-   GL_LINES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES,
-   GL_TRIANGLES
+static const GLenum reduced_prim[GL_POLYGON+1] = { /* read-only map, indexed by GL primitive: the point/line/triangle class it is rasterized as */
+  GL_POINTS,      /* GL_POINTS */
+  GL_LINES,       /* GL_LINES */
+  GL_LINES,       /* GL_LINE_LOOP */
+  GL_LINES,       /* GL_LINE_STRIP */
+  GL_TRIANGLES,   /* GL_TRIANGLES */
+  GL_TRIANGLES,   /* GL_TRIANGLE_STRIP */
+  GL_TRIANGLES,   /* GL_TRIANGLE_FAN */
+  GL_TRIANGLES,   /* GL_QUADS */
+  GL_TRIANGLES,   /* GL_QUAD_STRIP */
+  GL_TRIANGLES,   /* GL_POLYGON */
+};
 
+static void r300RasterPrimitive( GLcontext *ctx, GLuint prim );
+static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
+//static void r300ResetLineStipple( GLcontext *ctx );
 
 /***********************************************************************
  *                    Emit primitives as inline vertices               *
@@ -319,7 +354,7 @@ static const GLenum reduced_prim[GL_POLYGON+1] = {
 #define VERT(x) (r300Vertex *)(r300verts + ((x) * vertsize * sizeof(int)))
 #define VERTEX r300Vertex 
 #define DO_DEBUG_VERTS (1 && (RADEON_DEBUG & DEBUG_VERTS))
-#define PRINT_VERTEX(x) r300PrintVertex(x)
+#define PRINT_VERTEX(x)
 #undef TAG
 #define TAG(x) r300_##x
 #include "tnl_dd/t_dd_triemit.h"
@@ -376,42 +411,31 @@ static struct {
 #define AREA_IS_CCW( a ) (a < 0)
 #define GET_VERTEX(e) (rmesa->swtcl.verts + (e*rmesa->swtcl.vertex_size*sizeof(int)))
 
-#define VERT_SET_RGBA( v, c )                                          \
-do {                                                           \
-   r300_color_t *color = (r300_color_t *)&((v)->ui[coloroffset]);      \
-   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]);               \
-   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]);             \
-   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]);              \
-   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]);             \
+/* Only used to pull back float colors into vertices: converts src[0..3] to
+ * ubytes with components 0 and 2 swapped.  NOTE(review): r300SetVertexFormat
+ * emits colors as floats -- confirm this ubyte packing is still valid. */
+#define R300_COLOR( dst, src )                         \
+do {                                                   \
+   UNCLAMPED_FLOAT_TO_UBYTE((dst)[0], (src)[2]);       \
+   UNCLAMPED_FLOAT_TO_UBYTE((dst)[1], (src)[1]);       \
+   UNCLAMPED_FLOAT_TO_UBYTE((dst)[2], (src)[0]);       \
+   UNCLAMPED_FLOAT_TO_UBYTE((dst)[3], (src)[3]);       \
 } while (0)
 
-#define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
+#define VERT_SET_RGBA( v, c )    if (coloroffset) R300_COLOR( v->ub4[coloroffset], c ) /* bare if: only safe when used as a standalone statement */
+#define VERT_COPY_RGBA( v0, v1 ) if (coloroffset) v0->ui[coloroffset] = v1->ui[coloroffset] /* copies one 32-bit word at coloroffset */
+#define VERT_SAVE_RGBA( idx )    if (coloroffset) color[idx] = v[idx]->ui[coloroffset] /* color[] and v[] come from the expansion site */
+#define VERT_RESTORE_RGBA( idx ) if (coloroffset) v[idx]->ui[coloroffset] = color[idx] /* inverse of VERT_SAVE_RGBA */
 
-#define VERT_SET_SPEC( v, c )                                  \
-do {                                                           \
-   if (specoffset) {                                           \
-      r300_color_t *spec = (r300_color_t *)&((v)->ui[specoffset]);     \
-      UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);     \
-      UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);   \
-      UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);    \
-   }                                                           \
-} while (0)
-#define VERT_COPY_SPEC( v0, v1 )                       \
+#define R300_SPEC( dst, src )                          \
 do {                                                   \
-   if (specoffset) {                                   \
-      r300_color_t *spec0 = (r300_color_t *)&((v0)->ui[specoffset]);   \
-      r300_color_t *spec1 = (r300_color_t *)&((v1)->ui[specoffset]);   \
-      spec0->red   = spec1->red;       \
-      spec0->green = spec1->green;     \
-      spec0->blue  = spec1->blue;      \
-   }                                                   \
+   UNCLAMPED_FLOAT_TO_UBYTE((dst)[0], (src)[2]);       \
+   UNCLAMPED_FLOAT_TO_UBYTE((dst)[1], (src)[1]);       \
+   UNCLAMPED_FLOAT_TO_UBYTE((dst)[2], (src)[0]);       \
 } while (0)
 
-/* These don't need LE32_TO_CPU() as they used to save and restore
- * colors which are already in the correct format.
- */
-#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
-#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_SET_SPEC( v, c )    if (specoffset) R300_SPEC( v->ub4[specoffset], c )
+#define VERT_COPY_SPEC( v0, v1 ) if (specoffset) COPY_3V(v0->ub4[specoffset], v1->ub4[specoffset])
 #define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
 #define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
 
@@ -460,6 +484,7 @@ do {                                                        \
 #include "tnl_dd/t_dd_tritmp.h"
 
 
+
 static void init_rast_tab( void )
 {
    init();
@@ -510,56 +535,59 @@ static void init_rast_tab( void )
 /**********************************************************************/
 /*                    Choose render functions                         */
 /**********************************************************************/
-
-void r300ChooseRenderState( GLcontext *ctx )
+static void r300ChooseRenderState( GLcontext *ctx )
 {
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   r300ContextPtr rmesa = R300_CONTEXT(ctx);
-   GLuint index = 0;
-   GLuint flags = ctx->_TriangleCaps;
-
-   // if (!rmesa->TclFallback || rmesa->Fallback) 
-//      return;
-
-   if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R300_TWOSIDE_BIT;
-   if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
-
-   if (index != rmesa->swtcl.RenderIndex) {
-      tnl->Driver.Render.Points = rast_tab[index].points;
-      tnl->Driver.Render.Line = rast_tab[index].line;
-      tnl->Driver.Render.ClippedLine = rast_tab[index].line;
-      tnl->Driver.Render.Triangle = rast_tab[index].triangle;
-      tnl->Driver.Render.Quad = rast_tab[index].quad;
-
-      if (index == 0) {
-       tnl->Driver.Render.PrimTabVerts = r300_render_tab_verts;
-        tnl->Driver.Render.PrimTabElts = r300_render_tab_elts;
-        tnl->Driver.Render.ClippedPolygon = r300_fast_clipped_poly;
-      } else {
-        tnl->Driver.Render.PrimTabVerts = _tnl_render_tab_verts;
-        tnl->Driver.Render.PrimTabElts = _tnl_render_tab_elts;
-        tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
-      }
-
-      rmesa->swtcl.RenderIndex = index;
-   }
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       GLuint index = 0; /* rast_tab[] selector, built from the R300_*_BIT flags below */
+       GLuint flags = ctx->_TriangleCaps;
+
+       if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R300_TWOSIDE_BIT;
+       if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
+
+       if (index != rmesa->swtcl.RenderIndex) { /* re-install the callbacks only when the selection changed */
+               tnl->Driver.Render.Points = rast_tab[index].points;
+               tnl->Driver.Render.Line = rast_tab[index].line;
+               tnl->Driver.Render.ClippedLine = rast_tab[index].line; /* clipped lines share the plain line function */
+               tnl->Driver.Render.Triangle = rast_tab[index].triangle;
+               tnl->Driver.Render.Quad = rast_tab[index].quad;
+
+               if (index == 0) { /* neither two-sided nor unfilled: use the driver's own render tables */
+                       tnl->Driver.Render.PrimTabVerts = r300_render_tab_verts;
+                       tnl->Driver.Render.PrimTabElts = r300_render_tab_elts;
+                       tnl->Driver.Render.ClippedPolygon = r300_fast_clipped_poly;
+               } else { /* otherwise use the generic TNL render tables */
+                       tnl->Driver.Render.PrimTabVerts = _tnl_render_tab_verts;
+                       tnl->Driver.Render.PrimTabElts = _tnl_render_tab_elts;
+                       tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
+               }
+
+               rmesa->swtcl.RenderIndex = index; /* cache the selection for the comparison above */
+       }
 }
 
 
 static void r300RenderStart(GLcontext *ctx)
 {
-  r300ContextPtr rmesa = R300_CONTEXT( ctx );
-       fprintf(stderr, "%s\n", __FUNCTION__);
+        r300ContextPtr rmesa = R300_CONTEXT( ctx );
+       /* Per-render setup: choose the render functions, program the vertex format, then call the shader update hooks. */
 
+       r300ChooseRenderState(ctx);     
        r300SetVertexFormat(ctx);
+
+       r300UpdateShaders(rmesa);
+       r300UpdateShaderStates(rmesa);
+
+       r300EmitCacheFlush(rmesa);
+       /* Run a pending DMA flush callback, unless it is the swtcl one installed by r300AllocDmaLowVerts. */
        if (rmesa->dma.flush != 0 && 
            rmesa->dma.flush != flush_last_swtcl_prim)
-         rmesa->dma.flush( rmesa );
+               rmesa->dma.flush( rmesa );
+
 }
 
 static void r300RenderFinish(GLcontext *ctx)
 {
-       fprintf(stderr, "%s\n", __FUNCTION__);
 }
 
 static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
@@ -567,7 +595,7 @@ static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        
        if (rmesa->swtcl.hw_primitive != hwprim) {
-//             R300_NEWPRIM( rmesa );
+               R300_NEWPRIM( rmesa );
                rmesa->swtcl.hw_primitive = hwprim;
        }
 }
@@ -577,9 +605,12 @@ static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        rmesa->swtcl.render_primitive = prim;
-       if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
-               r300RasterPrimitive( ctx, reduced_prim[prim] );
-       fprintf(stderr, "%s\n", __FUNCTION__);
+
+       if ((prim == GL_TRIANGLES) && (ctx->_TriangleCaps & DD_TRI_UNFILLED))
+         return;
+
+       r300RasterPrimitive( ctx, reduced_prim[prim] );
+       //      fprintf(stderr, "%s\n", __FUNCTION__);
        
 }
 
@@ -610,7 +641,7 @@ void r300InitSwtcl(GLcontext *ctx)
        
        /* FIXME: what are these numbers? */
        _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
-                           36 * sizeof(GLfloat) );
+                           48 * sizeof(GLfloat) );
        
        rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
        rmesa->swtcl.RenderIndex = ~0;
@@ -621,15 +652,17 @@ void r300InitSwtcl(GLcontext *ctx)
        _tnl_invalidate_vertices( ctx, ~0 );
        RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
 
+       _tnl_need_projected_coords( ctx, GL_FALSE );
        r300ChooseRenderState(ctx);
+
+       _mesa_validate_all_lighting_tables( ctx ); 
+
+       tnl->Driver.NotifyMaterialChange = 
+         _mesa_validate_all_lighting_tables;
 }
 
 void r300DestroySwtcl(GLcontext *ctx)
 {
-       r300ContextPtr rmesa = R300_CONTEXT(ctx);
-//     if (rmesa->swtcl.indexed_verts.buf) 
-//             r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ );
-
 }
 
 void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset)
@@ -638,10 +671,10 @@ void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset)
        int cmd_written = 0;
 
        drm_radeon_cmd_header_t *cmd = NULL;
-       if (1)//RADEON_DEBUG & DEBUG_VERTS)
+       if (RADEON_DEBUG & DEBUG_VERTS)
          fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
                  __FUNCTION__, vertex_size, offset);
-       
+
        start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2), 2);
        e32(1);
        e32(vertex_size | (vertex_size << 8));
@@ -658,7 +691,6 @@ void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
 
        type = r300PrimitiveType(rmesa, primitive);
        num_verts = r300NumVerts(rmesa, vertex_nr, primitive);
-       r300EmitState(rmesa);
        
        start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
        e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);