From: Roland Scheidegger <rscheidegger@gmx.ch>
Date: Fri, 2 Jun 2006 22:47:31 +0000 (+0000)
Subject: implement arb_vertex_program in hw for r200. Code contains still some hacks, generic... 
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=98c791b543c4ba86b8bb54488bd872b33b10b1aa;p=mesa.git

Implement ARB_vertex_program in hardware for r200. The code still contains some hacks and generic attribs cause a fallback, but otherwise it seems to work quite well. It passes all glean vertProg1 tests with the exception of the degenerate LIT case (which is a hw limitation), and it runs the r200 render path of doom3/quake4 (1.1 patch needed for quake4). The code is heavily borrowed from the r300 driver, as the vertex program encoding is almost identical. ARB_vertex_program is not yet announced by default and still needs to be enabled via driconf.
---

diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
index 2084d52132e..75c09ff867e 100644
--- a/src/mesa/drivers/dri/r200/Makefile
+++ b/src/mesa/drivers/dri/r200/Makefile
@@ -31,6 +31,7 @@ DRIVER_SOURCES = r200_context.c \
 		 r200_vtxfmt_sse.c \
 		 r200_vtxfmt_x86.c \
 		 r200_fragshader.c \
+		 r200_vertprog.c \
 		 radeon_screen.c \
 		 $(EGL_SOURCES)
 
diff --git a/src/mesa/drivers/dri/r200/r200_cmdbuf.c b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
index 6ce68708940..91737d2d33c 100644
--- a/src/mesa/drivers/dri/r200/r200_cmdbuf.c
+++ b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
@@ -110,6 +110,11 @@ void r200SetUpAtomList( r200ContextPtr rmesa )
    /* FIXME: is this a good place to insert that atom ? */
    insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.spr );
    insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.prf );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pvs );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[0] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[1] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[0] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[1] );
 }
 
 static void r200SaveHwState( r200ContextPtr rmesa )
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 24f0ea55102..3e41d863148 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -62,6 +62,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_tcl.h"
 #include "r200_vtxfmt.h"
 #include "r200_maos.h"
+#include "r200_vertprog.h"
 
 #define need_GL_ARB_multisample
 #define need_GL_ARB_texture_compression
@@ -76,7 +77,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_NV_vertex_program
 #include "extension_helper.h"
 
-#define DRIVER_DATE	"20060327"
+#define DRIVER_DATE	"20060602"
 
 #include "vblank.h"
 #include "utils.h"
@@ -310,6 +311,7 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    r200InitIoctlFuncs(&functions);
    r200InitStateFuncs(&functions);
    r200InitTextureFuncs(&functions);
+   r200InitShaderFuncs(&functions); 
 
    /* Allocate and initialize the Mesa context */
    if (sharedContextPrivate)
@@ -417,6 +419,12 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    ctx->Const.MaxLineWidthAA = 10.0;
    ctx->Const.LineWidthGranularity = 0.0625;
 
+   ctx->Const.VertexProgram.MaxNativeInstructions = R200_VSF_MAX_INST;
+   ctx->Const.VertexProgram.MaxNativeAttribs = 12;
+   ctx->Const.VertexProgram.MaxNativeTemps = R200_VSF_MAX_TEMPS;
+   ctx->Const.VertexProgram.MaxNativeParameters = R200_VSF_MAX_PARAM;
+   ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+
    /* Initialize the software rasterizer and helper modules.
     */
    _swrast_CreateContext( ctx );
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index 0de131adc1d..2ff9b9f32e6 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -46,6 +46,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "mtypes.h"
 #include "colormac.h"
 #include "r200_reg.h"
+#include "r200_vertprog.h"
 
 #define ENABLE_HW_3D_TEXTURE 1  /* XXX this is temporary! */
 
@@ -94,6 +95,15 @@ typedef void (*r200_point_func)( r200ContextPtr,
 				   r200Vertex * );
 
 
+struct r200_vertex_program {
+        struct vertex_program mesa_program; /* Must be first: the driver casts between the two struct pointer types */
+        int translated; /* NOTE(review): presumably nonzero once instr[] has been generated - confirm */
+        VERTEX_SHADER_INSTRUCTION instr[R200_VSF_MAX_INST + 2]; /* NOTE(review): presumably the translated hw instructions - confirm */
+        int pos_end; /* NOTE(review): not used in the visible part of the patch - meaning to be confirmed */
+        int inputs[VERT_ATTRIB_MAX]; /* hw input register per Mesa vertex attribute; t_src_index() asserts the entry is not -1 */
+        int native; /* NOTE(review): presumably set when the program fits the hw limits - confirm */
+};
+
 struct r200_colorbuffer_state {
    GLuint clear;
 #if 000
@@ -336,6 +346,34 @@ struct r200_state_atom {
 #define AFS_IA1                   4 /* 2f0c */
 #define AFS_STATE_SIZE           33
 
+#define PVS_CMD_0                 0
+#define PVS_CNTL_1                1
+#define PVS_CNTL_2                2
+#define PVS_STATE_SIZE            3
+
+/* those are quite big... */
+#define VPI_CMD_0                 0
+#define VPI_OPDST_0               1
+#define VPI_SRC0_0                2
+#define VPI_SRC1_0                3
+#define VPI_SRC2_0                4
+#define VPI_OPDST_63              253
+#define VPI_SRC0_63               254
+#define VPI_SRC1_63               255
+#define VPI_SRC2_63               256
+#define VPI_STATE_SIZE            257
+
+#define VPP_CMD_0                0
+#define VPP_PARAM0_0             1
+#define VPP_PARAM1_0             2
+#define VPP_PARAM2_0             3
+#define VPP_PARAM3_0             4
+#define VPP_PARAM0_95            381
+#define VPP_PARAM1_95            382
+#define VPP_PARAM2_95            383
+#define VPP_PARAM3_95            384
+#define VPP_STATE_SIZE           385
+
 #define TCL_CMD_0                 0
 #define TCL_LIGHT_MODEL_CTL_0     1
 #define TCL_LIGHT_MODEL_CTL_1     2
@@ -567,6 +605,9 @@ struct r200_hw_state {
    struct r200_state_atom glt;
    struct r200_state_atom prf;
    struct r200_state_atom afs[2];
+   struct r200_state_atom pvs;
+   struct r200_state_atom vpi[2];
+   struct r200_state_atom vpp[2];
    struct r200_state_atom atf;
    struct r200_state_atom spr;
 
@@ -883,6 +924,7 @@ struct r200_context {
     */
    struct r200_hw_state hw;
    struct r200_state state;
+   struct r200_vertex_program *curr_vp_hw;
 
    /* Texture object bookkeeping
     */
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index 41c7607b61d..d266e78910e 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -54,6 +54,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_tex.h"
 #include "r200_swtcl.h"
 #include "r200_vtxfmt.h"
+#include "r200_vertprog.h"
 
 #include "drirenderbuffer.h"
 
@@ -2100,7 +2101,71 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
       break;
 
    case GL_VERTEX_PROGRAM_ARB:
-      TCL_FALLBACK(rmesa->glCtx, R200_TCL_FALLBACK_VERTEX_PROGRAM, state);
+      if (!state) {
+	 GLuint i;
+	 R200_STATECHANGE( rmesa, vap );
+	 rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_PROG_VTX_SHADER_ENABLE;
+	 /* mark all tcl atoms (tcl vector state got overwritten) dirty
+	    not sure about tcl scalar state - we need at least grd
+	    with vert progs too.
+	    ucp looks like it doesn't get overwritten (may even work
+	    with vp for pos-invariant progs if we're lucky) */
+	 R200_STATECHANGE( rmesa, mtl[0] );
+	 R200_STATECHANGE( rmesa, mtl[1] );
+	 R200_STATECHANGE( rmesa, fog );
+	 R200_STATECHANGE( rmesa, glt );
+	 R200_STATECHANGE( rmesa, eye );
+	 for (i = R200_MTX_MV; i <= R200_MTX_TEX5; i++) {
+	    R200_STATECHANGE( rmesa, mat[i] );
+	 }
+	 for (i = 0 ; i < 8; i++) {
+	    R200_STATECHANGE( rmesa, lit[i] );
+	 }
+	 R200_STATECHANGE( rmesa, tcl );
+	 for (i = 0; i <= ctx->Const.MaxClipPlanes; i++) {
+	    if (ctx->Transform.ClipPlanesEnabled & (1 << i)) {
+	       rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (R200_UCP_ENABLE_0 << i);
+	    }
+/*	    else {
+	       rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(R200_UCP_ENABLE_0 << i);
+	    }*/
+	 }
+	 /* FIXME: ugly as hell. need to call everything which might change tcl_output_vtxfmt0/1 and compsel */
+	 r200UpdateSpecular( ctx );
+	 r200Fogfv( ctx, GL_FOG_COORD_SRC, NULL );
+	/* shouldn't be necessary, as it's picked up anyway in r200ValidateState (_NEW_PROGRAM),
+	   but without it doom3 locks up at always the same places. Why? */
+	 r200UpdateTextureState( ctx );
+	 /* if we call r200UpdateTextureState we need the code below because we are calling it with
+	    non-current derived enabled values which may revert the state atoms for frag progs even when
+	    they already got disabled... ugh
+	    Should really figure out why we need to call r200UpdateTextureState in the first place */
+	 GLuint unit;
+	 for (unit = 0; unit < R200_MAX_TEXTURE_UNITS; unit++) {
+	    R200_STATECHANGE( rmesa, pix[unit] );
+	    R200_STATECHANGE( rmesa, tex[unit] );
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &=
+		~(R200_TXFORMAT_ST_ROUTE_MASK | R200_TXFORMAT_LOOKUP_DISABLE);
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT;
+	    /* need to guard this with drmSupportsFragmentShader? Should never get here if
+	       we don't announce ATI_fs, right? */
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXMULTI_CTL] = 0;
+         }
+	 R200_STATECHANGE( rmesa, cst );
+	 R200_STATECHANGE( rmesa, tf );
+	 rmesa->hw.cst.cmd[CST_PP_CNTL_X] = 0;
+      }
+      else {
+	 R200_STATECHANGE( rmesa, vap );
+	 if (!rmesa->TclFallback) {
+	 /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it? */
+	    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE /*| R200_VAP_SINGLE_BUF_STATE_ENABLE*/;
+	 }
+	 R200_STATECHANGE( rmesa, vpi[0] );
+	 R200_STATECHANGE( rmesa, vpi[1] );
+	 R200_STATECHANGE( rmesa, vpp[0] );
+	 R200_STATECHANGE( rmesa, vpp[1] );
+      }
       break;
 
    case GL_FRAGMENT_SHADER_ATI:
@@ -2310,6 +2375,8 @@ void r200ValidateState( GLcontext *ctx )
       r200UpdateLocalViewer( ctx );
    }
 
+/* FIXME: don't really need most of these when vertex progs are enabled */
+
    /* Need an event driven matrix update?
     */
    if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) 
@@ -2340,6 +2407,16 @@ void r200ValidateState( GLcontext *ctx )
 	 r200UpdateClipPlanes( ctx );
    }
 
+   if (new_state & (_NEW_PROGRAM|
+   /* need to test for pretty much anything due to possible parameter bindings */
+	_NEW_MODELVIEW|_NEW_PROJECTION|_NEW_TRANSFORM|
+	_NEW_LIGHT|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX|
+	_NEW_FOG|_NEW_POINT|_NEW_TRACK_MATRIX)) {
+      if (ctx->VertexProgram._Enabled) {
+	 r200SetupVertexProg( ctx );
+      }
+      else TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, 0);
+   }
 
    rmesa->NewGLState = 0;
 }
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index 14616b09f41..266beb61b6a 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -93,6 +93,19 @@ static int cmdvec( int offset, int stride, int count )
    return h.i;
 }
 
+/* Build a RADEON_CMD_VECLINEAR header for the given vector address.
+   Warning: count is divided by 4 compared to other cmds (so it fits in a char)! */
+static int cmdveclinear( int offset, int count ) 
+{
+   drm_radeon_cmd_header_t hdr;
+   hdr.i = 0;
+   hdr.veclinear.count = count;
+   hdr.veclinear.addr_hi = (offset & 0xff00) >> 8;
+   hdr.veclinear.addr_lo = offset & 0xff;
+   hdr.veclinear.cmd_type = RADEON_CMD_VECLINEAR;
+   return hdr.i;
+}
+
 static int cmdscl( int offset, int stride, int count ) 
 {
    drm_radeon_cmd_header_t h;
@@ -126,12 +139,27 @@ static GLboolean check_##NM( GLcontext *ctx, int idx )	\
 
 #define TCL_CHECK( NM, FLAG )				\
 static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+{							\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+   (void) idx;						\
+   return !rmesa->TclFallback && !ctx->VertexProgram._Enabled && (FLAG);	\
+}
+
+#define TCL_OR_VP_CHECK( NM, FLAG )			\
+static GLboolean check_##NM( GLcontext *ctx, int idx )	\
 {							\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
    (void) idx;						\
    return !rmesa->TclFallback && (FLAG);		\
 }
 
+#define VP_CHECK( NM, FLAG )				\
+static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+{							\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+   (void) idx;						\
+   return !rmesa->TclFallback && ctx->VertexProgram._Enabled && (FLAG);		\
+}
 
 
 CHECK( always, GL_TRUE )
@@ -150,7 +178,11 @@ TCL_CHECK( tcl, GL_TRUE )
 TCL_CHECK( tcl_tex, rmesa->state.texture.unit[idx].unitneeded )
 TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
 TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[idx].Enabled )
-TCL_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) )
+TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) )
+TCL_OR_VP_CHECK( tcl_or_vp, GL_TRUE )
+VP_CHECK( tcl_vp, GL_TRUE )
+VP_CHECK( tcl_vp_size, ctx->VertexProgram.Current->Base.NumNativeInstructions > 64 )
+VP_CHECK( tcl_vpp_size, ctx->VertexProgram.Current->Base.NumNativeParameters > 96 )
 
 
 /* Initialize the context's hardware state.
@@ -307,13 +339,27 @@ void r200InitState( r200ContextPtr rmesa )
       ALLOC_STATE( cube[4], never, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
       ALLOC_STATE( cube[5], never, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
    }
-
-   ALLOC_STATE( tcl, tcl, TCL_STATE_SIZE, "TCL/tcl", 0 );
+   if (rmesa->r200Screen->drmSupportsVertexProgram) {
+      ALLOC_STATE( pvs, tcl_vp, PVS_STATE_SIZE, "PVS/pvscntl", 0 );
+      ALLOC_STATE( vpi[0], tcl_vp, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
+      ALLOC_STATE( vpi[1], tcl_vp_size, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
+      ALLOC_STATE( vpp[0], tcl_vp, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 );
+      ALLOC_STATE( vpp[1], tcl_vpp_size, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 );
+   }
+   else {
+      ALLOC_STATE( pvs, never, PVS_STATE_SIZE, "PVS/pvscntl", 0 );
+      ALLOC_STATE( vpi[0], never, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
+      ALLOC_STATE( vpi[1], never, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
+      ALLOC_STATE( vpp[0], never, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 );
+      ALLOC_STATE( vpp[1], never, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 );
+   }
+   /* FIXME: this atom has two commands, we need only one (ucp_vert_blend) for vp */
+   ALLOC_STATE( tcl, tcl_or_vp, TCL_STATE_SIZE, "TCL/tcl", 0 );
    ALLOC_STATE( msl, tcl, MSL_STATE_SIZE, "MSL/matrix-select", 0 );
    ALLOC_STATE( tcg, tcl, TCG_STATE_SIZE, "TCG/texcoordgen", 0 );
    ALLOC_STATE( mtl[0], tcl_lighting, MTL_STATE_SIZE, "MTL0/material0", 0 );
    ALLOC_STATE( mtl[1], tcl_lighting, MTL_STATE_SIZE, "MTL1/material1", 1 );
-   ALLOC_STATE( grd, tcl, GRD_STATE_SIZE, "GRD/guard-band", 0 );
+   ALLOC_STATE( grd, tcl_or_vp, GRD_STATE_SIZE, "GRD/guard-band", 0 );
    ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 0 );
    ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 0 );
    ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 0 );
@@ -411,6 +457,7 @@ void r200InitState( r200ContextPtr rmesa )
    }
    rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_0);
    rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_1);
+   rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(R200_EMIT_VAP_PVS_CNTL);
    rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_0);
    rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_0);
    rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_1);
@@ -450,6 +497,15 @@ void r200InitState( r200ContextPtr rmesa )
    rmesa->hw.mtl[1].cmd[MTL_CMD_1] =
       cmdscl2( R200_SS_MAT_1_SHININESS, 1, 1 );
 
+   rmesa->hw.vpi[0].cmd[VPI_CMD_0] =
+      cmdveclinear( R200_PVS_PROG0, 64 );
+   rmesa->hw.vpi[1].cmd[VPI_CMD_0] =
+      cmdveclinear( R200_PVS_PROG1, 64 );
+   rmesa->hw.vpp[0].cmd[VPP_CMD_0] =
+      cmdveclinear( R200_PVS_PARAM0, 96 );
+   rmesa->hw.vpp[1].cmd[VPP_CMD_0] =
+      cmdveclinear( R200_PVS_PARAM1, 96 );
+
    rmesa->hw.grd.cmd[GRD_CMD_0] = 
       cmdscl( R200_SS_VERT_GUARD_CLIP_ADJ_ADDR, 1, 4 );
    rmesa->hw.fog.cmd[FOG_CMD_0] = 
diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
index c41622debe5..2530e1b1046 100644
--- a/src/mesa/drivers/dri/r200/r200_tcl.c
+++ b/src/mesa/drivers/dri/r200/r200_tcl.c
@@ -390,27 +390,86 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
    if (rmesa->NewGLState)
       r200ValidateState( ctx );
 
+   if (!ctx->VertexProgram._Enabled) {
    /* NOTE: inputs != tnl->render_inputs - these are the untransformed
     * inputs.
     */
-   if (ctx->Light.Enabled) {
-      inputs |= VERT_BIT_NORMAL;
-   }
+      if (ctx->Light.Enabled) {
+	 inputs |= VERT_BIT_NORMAL;
+      }
 
-   if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR) {
-      inputs |= VERT_BIT_COLOR1;
-   }
+      if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR) {
+	 inputs |= VERT_BIT_COLOR1;
+      }
 
-   if ( (ctx->Fog.FogCoordinateSource == GL_FOG_COORD) && ctx->Fog.Enabled ) {
-      inputs |= VERT_BIT_FOG;
-   }
+      if ( (ctx->Fog.FogCoordinateSource == GL_FOG_COORD) && ctx->Fog.Enabled ) {
+	 inputs |= VERT_BIT_FOG;
+      }
 
-   for (i = 0 ; i < ctx->Const.MaxTextureUnits; i++) {
-      if (ctx->Texture.Unit[i]._ReallyEnabled) {
-	 if (rmesa->TexGenNeedNormals[i]) {
-	    inputs |= VERT_BIT_NORMAL;
+      for (i = 0 ; i < ctx->Const.MaxTextureUnits; i++) {
+	 if (ctx->Texture.Unit[i]._ReallyEnabled) {
+	    if (rmesa->TexGenNeedNormals[i]) {
+	       inputs |= VERT_BIT_NORMAL;
+	    }
+	    inputs |= VERT_BIT_TEX(i);
 	 }
-	 inputs |= VERT_BIT_TEX(i);
+      }
+   }
+   else {
+      GLuint out_vtxfmt0 = 0;
+      GLuint out_vtxfmt1 = 0;
+      GLuint out_compsel = 0;
+      GLuint vp_out = rmesa->curr_vp_hw->mesa_program.Base.OutputsWritten;
+      /* can't handle other inputs, generic attribs etc. currently - should never arrive here */
+      assert ((rmesa->curr_vp_hw->mesa_program.Base.InputsRead &
+	 ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 |
+	  VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 |
+	  VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) == 0);
+      inputs |= rmesa->curr_vp_hw->mesa_program.Base.InputsRead;
+      /* FIXME: this is a mess. Not really sure how to set up TCL_OUTPUT_VTXFMT
+	 in "undefined" cases (e.g. output needed later but not written by vertex program or vice versa)
+	 - however misconfiguration here will almost certainly lock up the chip.
+	 I think at the very least we need to enable tcl outputs which we write to. Maybe even need to
+	 fix up a vertex program so an output needed later always gets written?
+	 For now just set the compsel and output_vtxfmt to the outputs written.
+	 However, for simplicity we assume always all 4 values are written which may not be correct
+	 (but I don't know if it could lead to lockups). */
+      assert(vp_out & (1 << VERT_RESULT_HPOS));
+      out_vtxfmt0 = R200_VTX_XY | R200_VTX_Z0 | R200_VTX_W0;
+      /* FIXME: need to always enable color_0 otherwise doom3's shadow vp (?) will lock up (?) */
+      out_vtxfmt0 |= R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT;
+      out_compsel = R200_OUTPUT_XYZW;
+      if (vp_out & (1 << VERT_RESULT_COL0)) {
+	 out_vtxfmt0 |= R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT;
+	 out_compsel |= R200_OUTPUT_COLOR_0;
+      }
+      if (vp_out & (1 << VERT_RESULT_COL1)) {
+	 out_vtxfmt0 |= R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT;
+	 out_compsel |= R200_OUTPUT_COLOR_1;
+      }
+      /* FIXME: probably not everything is set up for fogc and psiz to work correctly */
+      if (vp_out & (1 << VERT_RESULT_FOGC)) {
+	 out_vtxfmt0 |= R200_VTX_DISCRETE_FOG;
+         out_compsel |= R200_OUTPUT_DISCRETE_FOG;
+      }
+      if (vp_out & (1 << VERT_RESULT_PSIZ)) {
+	 out_vtxfmt0 |= R200_VTX_POINT_SIZE;
+	 out_compsel |= R200_OUTPUT_PT_SIZE;
+      }
+      for (i = VERT_RESULT_TEX0; i < VERT_RESULT_TEX6; i++) {
+	 if (vp_out & (1 << i)) {
+	    out_vtxfmt1 |= 4  << ((i - VERT_RESULT_TEX0) * 3);
+	    out_compsel |= R200_OUTPUT_TEX_0 << (i - VERT_RESULT_TEX0);
+	 }
+      }
+      if ((rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] != out_vtxfmt0) ||
+	 (rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] != out_vtxfmt1) ||
+	 (rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] != out_compsel)) {
+	 R200_STATECHANGE( rmesa, vtx );
+	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_vtxfmt0;
+	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] = out_vtxfmt1;
+	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] = out_compsel;
+	 /* FIXME: should restore this when disabling vertex programs maybe? */
       }
    }
 
@@ -486,7 +545,8 @@ static void transition_to_swtnl( GLcontext *ctx )
     * need to put the card into D3D mode to make it work:
     */
    R200_STATECHANGE( rmesa, vap );
-   rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_TCL_ENABLE;
+   /* not sure if it's strictly necessary to disable VAP_PROG_VTX_SHADER_ENABLE in addition to VAP_TCL_ENABLE) */
+   rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~(R200_VAP_TCL_ENABLE|R200_VAP_PROG_VTX_SHADER_ENABLE);
 }
 
 static void transition_to_hwtnl( GLcontext *ctx )
@@ -513,6 +573,10 @@ static void transition_to_hwtnl( GLcontext *ctx )
    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_TCL_ENABLE;
    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_FORCE_W_TO_ONE;
 
+   if (ctx->VertexProgram._Enabled) {
+      rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE;
+   }
+
    if ( ((rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] & R200_FOG_USE_MASK)
       == R200_FOG_USE_SPEC_ALPHA) &&
       (ctx->Fog.FogCoordinateSource == GL_FOG_COORD )) {
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
new file mode 100644
index 00000000000..cd008df7aab
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -0,0 +1,1180 @@
+/**************************************************************************
+
+Copyright (C) 2005 Aapo Tahkola.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Aapo Tahkola <aet@rasterburn.org>
+ */
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "program.h"
+
+#include "r200_context.h"
+#include "r200_vertprog.h"
+#include "r200_ioctl.h"
+#include "r200_tcl.h"
+#include "program_instruction.h"
+#include "tnl/tnl.h"
+
+#define SCALAR_FLAG (1<<31)
+#define FLAG_MASK (1<<31)
+#define OP_MASK (0xf)  /* we are unlikely to have more than 15 */
+#define OPN(operator, ip, op) {#operator, OPCODE_##operator, ip, op}
+
+static struct{
+   char *name; /* opcode mnemonic, stringified from the OPN() argument */
+   int opcode; /* matching Mesa OPCODE_* value; op_operands() searches on it */
+   unsigned long ip; /* number of input operands and flags */
+   unsigned long op; /* NOTE(review): presumably the output-side count and flags - confirm against users */
+}op_names[]={
+   OPN(ABS, 1, 1),
+   OPN(ADD, 2, 1),
+   OPN(ARL, 1, 1|SCALAR_FLAG),
+   OPN(DP3, 2, 3|SCALAR_FLAG),
+   OPN(DP4, 2, 3|SCALAR_FLAG),
+   OPN(DPH, 2, 3|SCALAR_FLAG),
+   OPN(DST, 2, 1),
+   OPN(EX2, 1|SCALAR_FLAG, 4|SCALAR_FLAG),
+   OPN(EXP, 1|SCALAR_FLAG, 1),
+   OPN(FLR, 1, 1),
+   OPN(FRC, 1, 1),
+   OPN(LG2, 1|SCALAR_FLAG, 4|SCALAR_FLAG),
+   OPN(LIT, 1, 1),
+   OPN(LOG, 1|SCALAR_FLAG, 1),
+   OPN(MAD, 3, 1),
+   OPN(MAX, 2, 1),
+   OPN(MIN, 2, 1),
+   OPN(MOV, 1, 1),
+   OPN(MUL, 2, 1),
+   OPN(POW, 2|SCALAR_FLAG, 4|SCALAR_FLAG),
+   OPN(RCP, 1|SCALAR_FLAG, 4|SCALAR_FLAG),
+   OPN(RSQ, 1|SCALAR_FLAG, 4|SCALAR_FLAG),
+   OPN(SGE, 2, 1),
+   OPN(SLT, 2, 1),
+   OPN(SUB, 2, 1),
+   OPN(SWZ, 1, 1),
+   OPN(XPD, 2, 1),
+   OPN(RCC, 0, 0), //extra
+   OPN(PRINT, 0, 0),
+   OPN(END, 0, 0),
+};
+#undef OPN
+
+/* Refresh vp's parameter values and copy them into the vpp[0]/vpp[1] state atoms.
+   Returns GL_FALSE (after a message) if there are more than R200_VSF_MAX_PARAM parameters. */
+static GLboolean r200VertexProgUpdateParams(GLcontext *ctx, struct r200_vertex_program *vp)
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   GLfloat *fcmd = (GLfloat *)&rmesa->hw.vpp[0].cmd[VPP_CMD_0 + 1];
+   int pi;
+   struct vertex_program *mesa_vp = (void *)vp;
+   struct program_parameter_list *paramList;
+   drm_radeon_cmd_header_t tmp;
+
+   R200_STATECHANGE( rmesa, vpp[0] );
+   R200_STATECHANGE( rmesa, vpp[1] );
+   assert(mesa_vp->Base.Parameters);
+   _mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters);
+   paramList = mesa_vp->Base.Parameters;
+
+   if(paramList->NumParameters > R200_VSF_MAX_PARAM){
+      fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   for(pi = 0; pi < paramList->NumParameters; pi++) {
+      switch(paramList->Parameters[pi].Type) {
+      case PROGRAM_STATE_VAR:
+      case PROGRAM_NAMED_PARAM:
+      case PROGRAM_CONSTANT: /* all three kinds are copied as four floats */
+	 *fcmd++ = paramList->ParameterValues[pi][0];
+	 *fcmd++ = paramList->ParameterValues[pi][1];
+	 *fcmd++ = paramList->ParameterValues[pi][2];
+	 *fcmd++ = paramList->ParameterValues[pi][3];
+	 break;
+      default:
+	 _mesa_problem(NULL, "Bad param type in %s", __FUNCTION__);
+	 break;
+      }
+      if (pi == 95) { /* vpp[0] now holds its 96 params, continue in vpp[1] */
+	 fcmd = (GLfloat *)&rmesa->hw.vpp[1].cmd[VPP_CMD_0 + 1]; /* address of the slot, not its value */
+      }
+   }
+   /* hack up the cmd_size so not the whole state atom is emitted always. */
+   rmesa->hw.vpp[0].cmd_size = 1 + 4 * ((paramList->NumParameters > 96) ? 96 : paramList->NumParameters);
+   tmp.i = rmesa->hw.vpp[0].cmd[VPP_CMD_0];
+   tmp.veclinear.count = (paramList->NumParameters > 96) ? 96 : paramList->NumParameters;
+   rmesa->hw.vpp[0].cmd[VPP_CMD_0] = tmp.i;
+   if (paramList->NumParameters > 96) { /* the remainder goes out with the second atom */
+      rmesa->hw.vpp[1].cmd_size = 1 + 4 * (paramList->NumParameters - 96);
+      tmp.i = rmesa->hw.vpp[1].cmd[VPP_CMD_0];
+      tmp.veclinear.count = paramList->NumParameters - 96;
+      rmesa->hw.vpp[1].cmd[VPP_CMD_0] = tmp.i;
+   }
+   return GL_TRUE;
+}
+
+/* Translate a Mesa WRITEMASK_* bit set into the matching VSF_FLAG_* bits. */
+static unsigned long t_dst_mask(GLuint mask)
+{
+   unsigned long hw_mask = 0;
+
+   hw_mask |= (mask & WRITEMASK_X) ? VSF_FLAG_X : 0;
+   hw_mask |= (mask & WRITEMASK_Y) ? VSF_FLAG_Y : 0;
+   hw_mask |= (mask & WRITEMASK_Z) ? VSF_FLAG_Z : 0;
+   hw_mask |= (mask & WRITEMASK_W) ? VSF_FLAG_W : 0;
+   return hw_mask;
+}
+
+/* Translate a Mesa destination register into the hw output class and register index bits. */
+static unsigned long t_dst(struct prog_dst_register *dst)
+{
+   switch(dst->File) {
+   case PROGRAM_TEMPORARY:
+      return ((dst->Index << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP);
+   case PROGRAM_OUTPUT:
+      switch (dst->Index) {
+      case VERT_RESULT_HPOS:
+	 return R200_VSF_OUT_CLASS_RESULT_POS;
+      case VERT_RESULT_COL0:
+	 return R200_VSF_OUT_CLASS_RESULT_COLOR;
+      case VERT_RESULT_COL1: /* same class as COL0, register index 1 */
+	 return ((1 << R200_VPI_OUT_REG_INDEX_SHIFT)
+	    | R200_VSF_OUT_CLASS_RESULT_COLOR);
+      case VERT_RESULT_FOGC:
+	 return R200_VSF_OUT_CLASS_RESULT_FOGC;
+      case VERT_RESULT_TEX0:
+      case VERT_RESULT_TEX1:
+      case VERT_RESULT_TEX2:
+      case VERT_RESULT_TEX3:
+      case VERT_RESULT_TEX4:
+      case VERT_RESULT_TEX5: /* register index is the texture unit number */
+	 return (((dst->Index - VERT_RESULT_TEX0) << R200_VPI_OUT_REG_INDEX_SHIFT)
+	    | R200_VSF_OUT_CLASS_RESULT_TEXC);
+      case VERT_RESULT_PSIZ:
+	 return R200_VSF_OUT_CLASS_RESULT_POINTSIZE;
+      default:
+	 fprintf(stderr, "problem in %s, unknown dst output reg %d\n", __FUNCTION__, dst->Index);
+	 exit(-1); /* fatal: must not report success to the parent process */
+	 return 0;
+      }
+   case PROGRAM_ADDRESS:
+      assert (dst->Index == 0); /* only address register 0 is accepted */
+      return R200_VSF_OUT_CLASS_ADDR;
+   default:
+      fprintf(stderr, "problem in %s, unknown register type %d\n", __FUNCTION__, dst->File);
+      exit(-1); /* fatal: must not report success to the parent process */
+      return 0;
+   }
+}
+
+/* Map a Mesa register file to the hw source class; any other file is fatal. */
+static unsigned long t_src_class(enum register_file file)
+{
+   switch(file){
+   case PROGRAM_TEMPORARY:
+      return VSF_IN_CLASS_TMP;
+
+   case PROGRAM_INPUT:
+      return VSF_IN_CLASS_ATTR;
+
+   case PROGRAM_LOCAL_PARAM:
+   case PROGRAM_ENV_PARAM:
+   case PROGRAM_NAMED_PARAM:
+   case PROGRAM_STATE_VAR:
+      return VSF_IN_CLASS_PARAM;
+   /* register files that are deliberately not accepted as a source:
+   case PROGRAM_OUTPUT:
+   case PROGRAM_WRITE_ONLY:
+   case PROGRAM_ADDRESS:
+   */
+   default:
+      fprintf(stderr, "problem in %s", __FUNCTION__);
+      exit(-1); /* fatal: must not report success to the parent process */
+   }
+}
+
+static __inline unsigned long t_swizzle(GLubyte swizzle)
+{
+/* Identity mapping: the Mesa SWIZZLE_* values are all identical to VSF_IN_COMPONENT_*, so the explicit translation below is disabled. */
+/*
+   switch(swizzle){
+   case SWIZZLE_X: return VSF_IN_COMPONENT_X;
+   case SWIZZLE_Y: return VSF_IN_COMPONENT_Y;
+   case SWIZZLE_Z: return VSF_IN_COMPONENT_Z;
+   case SWIZZLE_W: return VSF_IN_COMPONENT_W;
+   case SWIZZLE_ZERO: return VSF_IN_COMPONENT_ZERO;
+   case SWIZZLE_ONE: return VSF_IN_COMPONENT_ONE;
+   default:
+      fprintf(stderr, "problem in %s", __FUNCTION__);
+      exit(0);
+   }
+*/
+   return swizzle;
+}
+
+#if 0 /* debugging aid, compiled out */
+static void vp_dump_inputs(struct r200_vertex_program *vp, char *caller)
+{
+   int i;
+
+   if(vp == NULL){
+      fprintf(stderr, "vp null in call to %s from %s\n", __FUNCTION__, caller);
+      return ;
+   }
+
+   fprintf(stderr, "%s:<", caller);
+   for(i=0; i < VERT_ATTRIB_MAX; i++) /* print every vp->inputs[] entry on one line */
+   fprintf(stderr, "%d ", vp->inputs[i]);
+   fprintf(stderr, ">\n");
+
+}
+#endif
+
+static unsigned long t_src_index(struct r200_vertex_program *vp, struct prog_src_register *src)
+{
+/* Returns the hw register index for src. Locals of the disabled on-demand input allocation below:
+   int i;
+   int max_reg = -1;
+*/
+   if(src->File == PROGRAM_INPUT){ /* vertex attributes are remapped through vp->inputs[] */
+/*      if(vp->inputs[src->Index] != -1)
+	 return vp->inputs[src->Index];
+
+      for(i=0; i < VERT_ATTRIB_MAX; i++)
+	 if(vp->inputs[i] > max_reg)
+	    max_reg = vp->inputs[i];
+
+      vp->inputs[src->Index] = max_reg+1;*/
+
+      //vp_dump_inputs(vp, __FUNCTION__);	
+      assert(vp->inputs[src->Index] != -1); /* the mapping must already have been assigned */
+      return vp->inputs[src->Index];
+   } else {
+      if (src->Index < 0) { /* negative index: warn and fall back to register 0 */
+	 fprintf(stderr, "WARNING negative offsets for indirect addressing do not work\n");
+	 return 0;
+      }
+      return src->Index; /* every other register file uses the Mesa index unchanged */
+   }
+}
+
+/* Encode src as a vector operand: all four swizzle components are used, NegateBase is passed through unchanged, RelAddr is or-ed in at bit 4. */
+static unsigned long t_src(struct r200_vertex_program *vp, struct prog_src_register *src)
+{
+   return MAKE_VSF_SOURCE(t_src_index(vp, src),
+			t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			t_swizzle(GET_SWZ(src->Swizzle, 1)),
+			t_swizzle(GET_SWZ(src->Swizzle, 2)),
+			t_swizzle(GET_SWZ(src->Swizzle, 3)),
+			t_src_class(src->File),
+			src->NegateBase) | (src->RelAddr << 4);
+}
+
+/* Encode src as a scalar operand: swizzle component 0 is used for all four
+   channels and any nonzero NegateBase selects VSF_FLAG_ALL as negate mask. */
+static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_src_register *src)
+{
+   /* t_swizzle() has no side effects, so one translation serves all channels */
+   unsigned long comp = t_swizzle(GET_SWZ(src->Swizzle, 0));
+
+   return MAKE_VSF_SOURCE(t_src_index(vp, src),
+			comp, comp, comp, comp, t_src_class(src->File),
+			src->NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4);
+}
+
+/* Map a Mesa opcode to its R200_VPI_OUT_OP_* value; any opcode not listed is reported and the process exits with -1. */
+static unsigned long t_opcode(enum prog_opcode opcode)
+{
+   switch(opcode){
+   case OPCODE_DST: return R200_VPI_OUT_OP_DST;
+   case OPCODE_EX2: return R200_VPI_OUT_OP_EX2;
+   case OPCODE_EXP: return R200_VPI_OUT_OP_EXP;
+   case OPCODE_FRC: return R200_VPI_OUT_OP_FRC;
+   case OPCODE_LG2: return R200_VPI_OUT_OP_LG2;
+   case OPCODE_LOG: return R200_VPI_OUT_OP_LOG;
+   case OPCODE_MAX: return R200_VPI_OUT_OP_MAX;
+   case OPCODE_MIN: return R200_VPI_OUT_OP_MIN;
+   case OPCODE_MUL: return R200_VPI_OUT_OP_MUL;
+   case OPCODE_RCP: return R200_VPI_OUT_OP_RCP;
+   case OPCODE_RSQ: return R200_VPI_OUT_OP_RSQ;
+   case OPCODE_SGE: return R200_VPI_OUT_OP_SGE;
+   case OPCODE_SLT: return R200_VPI_OUT_OP_SLT;
+   case OPCODE_DP4: return R200_VPI_OUT_OP_DOT; /* DP4 is the opcode that maps onto the DOT op */
+   case OPCODE_ADD: return R200_VPI_OUT_OP_ADD;
+
+   default: 
+      fprintf(stderr, "%s: Should not be called with opcode %d!", __FUNCTION__, opcode);
+   }
+   exit(-1); /* only reached from the default case, every other case returns */
+   return 0;
+}
+
+/* Look up the operand descriptor (the "ip" field) of an opcode in the
+   op_names table.  An opcode that is missing from the table prints a
+   diagnostic and terminates the process. */
+static unsigned long op_operands(enum prog_opcode opcode)
+{
+   const int num_entries = sizeof(op_names) / sizeof(*op_names);
+   int entry = 0;
+
+   /* Can we trust mesas opcodes to be in order ? */
+   while (entry < num_entries) {
+      if (op_names[entry].opcode == opcode) {
+	 return op_names[entry].ip;
+      }
+      entry++;
+   }
+
+   fprintf(stderr, "op %d not found in op_names\n", opcode);
+   exit(-1);
+   return 0;
+}
+
+/* CMP_SRCS(a, b) is non-zero when the translator treats sources a and b as
+   conflicting: their RelAddr flags differ, or their indices differ while both
+   are in the PARAM class or both are in the ATTR class.  The translator then
+   copies one of the two into a scratch temporary before using it.
+   Both arguments are expanded several times, so they must be side-effect free.
+   TODO: Get rid of t_src_class call */
+#define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
+		       ((t_src_class(a.File) == VSF_IN_CLASS_PARAM && \
+			 t_src_class(b.File) == VSF_IN_CLASS_PARAM) || \
+			(t_src_class(a.File) == VSF_IN_CLASS_ATTR && \
+			 t_src_class(b.File) == VSF_IN_CLASS_ATTR)))) \
+
+/* fglrx on rv250 encodes unused sources as follows:
+   unused but necessary sources are the same as the previous source, zeroed out.
+   unnecessary sources are the same as the previous source but with VSF_IN_CLASS_NONE set.
+   E.g. an add (2 args) used as a mov has its 2nd arg zeroed out and its 3rd arg
+   set to VSF_IN_CLASS_NONE. Not sure if this is strictly necessary. */
+
+/* Use these simpler definitions. They read o_inst->srcN, so they must not be
+   used before that source has been set up.
+   They are NOT semantically equivalent to the r300 ones; code taken from r300
+   needs changes.
+   ZERO_SRC_n: source n with the 12 bits starting at R200_VPI_IN_X_SHIFT
+   (the component selects) replaced by R200_VPI_IN_SELECT_ZERO for X, Y, Z and W. */
+#define ZERO_SRC_0 (((o_inst->src0 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
+				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
+
+#define ZERO_SRC_1 (((o_inst->src1 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
+				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
+
+#define ZERO_SRC_2 (((o_inst->src2 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
+				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
+
+/* UNUSED_SRC_n: source n with its low four bits replaced by 9, which is the
+   value of R200_VPI_IN_REG_CLASS_NONE.  Bit 4 and above are kept. */
+#define UNUSED_SRC_0 ((o_inst->src0 & ~15) | 9)
+
+#define UNUSED_SRC_1 ((o_inst->src1 & ~15) | 9)
+
+#define UNUSED_SRC_2 ((o_inst->src2 & ~15) | 9)
+
+
+/* DP4 version seems to trigger some hw peculiarity - fglrx does this on r200 however */
+#define PREFER_DP4
+
+/*
+ * Translate the Mesa vertex program behind vp into native r200 vertex shader
+ * instructions, which are stored in vp->instr.
+ *
+ * For position invariant programs four instructions computing the clip space
+ * position from the MVP matrix are prepended to the Mesa program itself
+ * (the instruction array of the program is replaced).
+ *
+ * Side effects: sets vp->native, vp->translated, vp->pos_end and vp->inputs,
+ * and updates NumNativeInstructions / NumNativeParameters of the program.
+ *
+ * Returns GL_TRUE if the whole program could be translated (vp->native is
+ * then GL_TRUE as well).  Returns GL_FALSE when the program cannot run on
+ * the hw: it reads unsupported (generic) inputs, needs more temporaries or
+ * more than R200_VSF_MAX_INST instructions than the hw has, uses an
+ * unsupported opcode, or memory for the position invariant prologue could
+ * not be allocated.
+ */
+static GLboolean r200_translate_vertex_program(struct r200_vertex_program *vp)
+{
+   struct vertex_program *mesa_vp = (void *)vp;
+   struct prog_instruction *vpi;
+   int i;
+   VERTEX_SHADER_INSTRUCTION *o_inst;
+   unsigned long operands;
+   int are_srcs_scalar;
+   unsigned long hw_op;
+   /* Index of the next free scratch temp.  Scratch temps are handed out from
+      the last temp the hw supports downwards, and are released again at the
+      start of every Mesa instruction.
+      Strangely enough r300 doesn't mind even though these would be out of range.
+      Smart enough to realize that it doesn't need it? */
+   int u_temp_i = R200_VSF_MAX_TEMPS - 1;
+   struct prog_src_register src[3];
+
+   vp->native = GL_FALSE;
+
+   if ((mesa_vp->Base.InputsRead &
+      ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 |
+      VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 |
+      VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) != 0) {
+      if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 fprintf(stderr, "can't handle vert prog inputs 0x%x\n",
+	    mesa_vp->Base.InputsRead);
+      }
+      return GL_FALSE;
+   }
+
+/* FIXME: is changing the prog safe to do here? */
+   if (mesa_vp->IsPositionInvariant) {
+      struct program_parameter_list *paramList;
+      GLint tokens[6] = { STATE_MATRIX, STATE_MVP, 0, 0, 0, STATE_MATRIX };
+
+#ifdef PREFER_DP4
+      tokens[5] = STATE_MATRIX;
+#else
+      tokens[5] = STATE_MATRIX_TRANSPOSE;
+#endif
+      paramList = mesa_vp->Base.Parameters;
+
+      vpi = malloc((mesa_vp->Base.NumInstructions + 4) * sizeof(struct prog_instruction));
+      if (vpi == NULL) {
+	 /* nothing has been modified yet, the program simply stays untranslated */
+	 if (R200_DEBUG & DEBUG_FALLBACKS) {
+	    fprintf(stderr, "out of memory while translating vertex program\n");
+	 }
+	 return GL_FALSE;
+      }
+      memset(vpi, 0, 4 * sizeof(struct prog_instruction));
+
+      for (i=0; i < 4; i++) {
+	 GLint idx;
+	 tokens[3] = tokens[4] = i;
+	 idx = _mesa_add_state_reference(paramList, tokens);
+#ifdef PREFER_DP4
+	 vpi[i].Opcode = OPCODE_DP4;
+	 vpi[i].StringPos = 0;
+	 vpi[i].Data = 0;
+
+	 vpi[i].DstReg.File = PROGRAM_OUTPUT;
+	 vpi[i].DstReg.Index = VERT_RESULT_HPOS;
+	 vpi[i].DstReg.WriteMask = 1 << i;
+	 vpi[i].DstReg.CondMask = COND_TR;
+
+	 vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
+	 vpi[i].SrcReg[0].Index = idx;
+	 vpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W);
+
+	 vpi[i].SrcReg[1].File = PROGRAM_INPUT;
+	 vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
+	 vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W);
+#else
+	 if (i == 0)
+	    vpi[i].Opcode = OPCODE_MUL;
+	 else
+	    vpi[i].Opcode = OPCODE_MAD;
+
+	 vpi[i].StringPos = 0;
+	 vpi[i].Data = 0;
+
+	 if (i == 3)
+	    vpi[i].DstReg.File = PROGRAM_OUTPUT;
+	 else
+	    vpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	 vpi[i].DstReg.Index = 0;
+	 vpi[i].DstReg.WriteMask = 0xf;
+	 vpi[i].DstReg.CondMask = COND_TR;
+
+	 vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
+	 vpi[i].SrcReg[0].Index = idx;
+	 vpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W);
+
+	 vpi[i].SrcReg[1].File = PROGRAM_INPUT;
+	 vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
+	 vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(i, i, i, i);
+
+	 if (i > 0) {
+	    vpi[i].SrcReg[2].File = PROGRAM_TEMPORARY;
+	    vpi[i].SrcReg[2].Index = 0;
+	    vpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W);
+	 }
+#endif	
+      }
+
+      /* i == 4 here: append the original program behind the prologue */
+      memcpy(&vpi[i], mesa_vp->Base.Instructions, mesa_vp->Base.NumInstructions * sizeof(struct prog_instruction));
+
+      free(mesa_vp->Base.Instructions);
+
+      mesa_vp->Base.Instructions = vpi;
+
+      mesa_vp->Base.NumInstructions += 4;
+      vpi = &mesa_vp->Base.Instructions[mesa_vp->Base.NumInstructions-1];
+
+      assert(vpi->Opcode == OPCODE_END);
+
+      mesa_vp->Base.InputsRead |= (1 << VERT_ATTRIB_POS);
+      mesa_vp->Base.OutputsWritten |= (1 << VERT_RESULT_HPOS);
+
+      //fprintf(stderr, "IsPositionInvariant is set!\n");
+      //_mesa_print_program(&mesa_vp->Base);
+   }
+
+   vp->pos_end = 0;
+   mesa_vp->Base.NumNativeInstructions = 0;
+   mesa_vp->Base.NumNativeParameters = mesa_vp->Base.Parameters->NumParameters;
+
+   for(i=0; i < VERT_ATTRIB_MAX; i++)
+      vp->inputs[i] = -1;
+/* fglrx uses fixed inputs as follows for conventional attribs.
+   generic attribs use non-fixed assignment, fglrx will always use the lowest attrib values available.
+   There are 12 generic attribs possible, corresponding to attrib 0, 2-11 and 13 in a hw vertex prog.
+   attr 1 and 12 are not available for generic attribs as those cannot be made vec4 (correspond to
+   vertex normal/weight)
+   attr 0 is pos, R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0
+   attr 2-5 use colors 0-3 (R200_VTX_FP_RGBA << R200_VTX_COLOR_0/1/2/3_SHIFT in R200_SE_VTX_FMT_0)
+   attr 6-11 use tex 0-5 (4 << R200_VTX_TEX0/1/2/3/4/5_COMP_CNT_SHIFT in R200_SE_VTX_FMT_1)
+   attr 13 uses vtx1 pos (R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0)
+   generic attribs would require some more work (dma regions, renaming). */
+
+/* may look different when using idx buf / input_route instead of se_vtx_fmt? */
+   vp->inputs[VERT_ATTRIB_POS] = 0;
+   vp->inputs[VERT_ATTRIB_WEIGHT] = 12;
+   vp->inputs[VERT_ATTRIB_NORMAL] = 1;
+   vp->inputs[VERT_ATTRIB_COLOR0] = 2;
+   vp->inputs[VERT_ATTRIB_COLOR1] = 3;
+   vp->inputs[VERT_ATTRIB_FOG] = 15;
+   vp->inputs[VERT_ATTRIB_TEX0] = 6;
+   vp->inputs[VERT_ATTRIB_TEX1] = 7;
+   vp->inputs[VERT_ATTRIB_TEX2] = 8;
+   vp->inputs[VERT_ATTRIB_TEX3] = 9;
+   vp->inputs[VERT_ATTRIB_TEX4] = 10;
+   vp->inputs[VERT_ATTRIB_TEX5] = 11;
+/* attr 4,5 and 13 are only used with generic attribs.
+   Haven't seen attr 14 used, maybe that's for the hw pointsize vec1 (which is
+   not possible to use with vertex progs as it is lacking in vert prog specification) */
+
+   assert(mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_HPOS));
+
+   /* from here on the program is never translated again, even if we bail out */
+   vp->translated = GL_TRUE;
+
+   o_inst = vp->instr;
+   for(vpi = mesa_vp->Base.Instructions; vpi->Opcode != OPCODE_END; vpi++, o_inst++){
+      /* every Mesa instruction starts with all scratch temps free again */
+      u_temp_i = R200_VSF_MAX_TEMPS - 1;
+      if(o_inst - vp->instr >= R200_VSF_MAX_INST) {
+	 mesa_vp->Base.NumNativeInstructions = R200_VSF_MAX_INST + 1;
+	 if (R200_DEBUG & DEBUG_FALLBACKS) {
+	    fprintf(stderr, "more than 128 native instructions\n");
+	 }
+	 return GL_FALSE;
+      }
+
+      operands = op_operands(vpi->Opcode);
+      are_srcs_scalar = operands & SCALAR_FLAG;
+      operands &= OP_MASK;
+
+      for(i=0; i < operands; i++)
+	 src[i] = vpi->SrcReg[i];
+
+      /* If two sources conflict (see CMP_SRCS), copy one of them into a
+	 scratch temp with an extra "ADD tmp, src, 0" and use the temp instead. */
+      if(operands == 3){ /* TODO: scalars */
+	 if( CMP_SRCS(src[1], src[2]) || CMP_SRCS(src[0], src[2]) ){
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
+		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		VSF_FLAG_ALL);
+
+	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[2]),
+		  SWIZZLE_X, SWIZZLE_Y,
+		  SWIZZLE_Z, SWIZZLE_W,
+		  t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4);
+
+	    o_inst->src1 = ZERO_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+
+	    src[2].File = PROGRAM_TEMPORARY;
+	    src[2].Index = u_temp_i;
+	    src[2].RelAddr = 0;
+	    u_temp_i--;
+	 }
+      }
+
+      if(operands >= 2){
+	 if( CMP_SRCS(src[1], src[0]) ){
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
+		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		VSF_FLAG_ALL);
+
+	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		  SWIZZLE_X, SWIZZLE_Y,
+		  SWIZZLE_Z, SWIZZLE_W,
+		  t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+
+	    o_inst->src1 = ZERO_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+
+	    src[0].File = PROGRAM_TEMPORARY;
+	    src[0].Index = u_temp_i;
+	    src[0].RelAddr = 0;
+	    u_temp_i--;
+	 }
+      }
+
+      /* These ops need special handling. */
+      switch(vpi->Opcode){
+      /* FIXME: ARL works fine, but negative offsets won't work - fglrx just seems to ignore neg offsets
+	 which isn't quite correct... */
+      case OPCODE_ARL:
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ARL, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+	 o_inst->src0 = t_src_scalar(vp, &src[0]);
+	 o_inst->src1 = UNUSED_SRC_0;
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_POW:
+/* pow takes only one argument, first scalar is in slot x, 2nd in slot z (other slots don't matter).
+   So may need to insert additional instruction */
+/* this appears to be different to r300 */
+	 if ((src[0].File == src[1].File) &&
+	     (src[0].Index == src[1].Index)) {
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&vpi->DstReg),
+		   t_dst_mask(vpi->DstReg.WriteMask));
+	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		   SWIZZLE_ZERO,
+		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+		   SWIZZLE_ZERO,
+		   t_src_class(src[0].File),
+		   src[0].NegateBase) | (src[0].RelAddr << 4);
+	    o_inst->src1 = UNUSED_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_0;
+	 }
+	 else {
+	    /* merge both scalars into one scratch temp: tmp = (src0.x, 0, src1.x, 0) */
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
+		   (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		   VSF_FLAG_ALL);
+	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		   SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
+		   t_src_class(src[0].File),
+		   src[0].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+	    o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		   SWIZZLE_ZERO, SWIZZLE_ZERO,
+		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO,
+		   t_src_class(src[1].File),
+		   src[1].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&vpi->DstReg),
+		   t_dst_mask(vpi->DstReg.WriteMask));
+	    o_inst->src0 = MAKE_VSF_SOURCE(u_temp_i,
+		   VSF_IN_COMPONENT_X,
+		   VSF_IN_COMPONENT_Y,
+		   VSF_IN_COMPONENT_Z,
+		   VSF_IN_COMPONENT_W,
+		   VSF_IN_CLASS_TMP,
+		   VSF_FLAG_NONE);
+	    o_inst->src1 = UNUSED_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_0;
+	    u_temp_i--;
+	 }
+	 goto next;
+
+      case OPCODE_MOV://ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO} 
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = ZERO_SRC_0;
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+      case OPCODE_MAD:
+	 /* a different hw opcode is used when all three sources are temporaries */
+	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
+	    src[1].File == PROGRAM_TEMPORARY &&
+	    src[2].File == PROGRAM_TEMPORARY) ? R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;
+
+	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&vpi->DstReg),
+	    t_dst_mask(vpi->DstReg.WriteMask));
+	 o_inst->src0 = t_src(vp, &src[0]);
+#if 0
+if ((o_inst - vp->instr) == 31) {
+/* fix up the broken vertex program of quake4 demo... */
+o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+			SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X,
+			t_src_class(src[1].File),
+			src[1].NegateBase) | (src[1].RelAddr << 4);
+o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+			SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y,
+			t_src_class(src[1].File),
+			src[1].NegateBase) | (src[1].RelAddr << 4);
+}
+else {
+	 o_inst->src1 = t_src(vp, &src[1]);
+	 o_inst->src2 = t_src(vp, &src[2]);
+}
+#else
+	 o_inst->src1 = t_src(vp, &src[1]);
+	 o_inst->src2 = t_src(vp, &src[2]);
+#endif
+	 goto next;
+
+      case OPCODE_DP3://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO} 
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+		SWIZZLE_ZERO,
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+
+	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
+		SWIZZLE_ZERO,
+		t_src_class(src[1].File),
+		src[1].NegateBase) | (src[1].RelAddr << 4);
+
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_SUB://ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
+		t_src_class(src[1].File),
+		(!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_ABS://MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+	 o_inst->op=MAKE_VSF_OP(R200_VPI_OUT_OP_MAX, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0=t_src(vp, &src[0]);
+	 o_inst->src1=MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
+		t_src_class(src[0].File),
+		(!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_FLR:
+      /* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W} 
+         ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
+
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_FRC,
+	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+	    t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = UNUSED_SRC_0;
+	 o_inst->src2 = UNUSED_SRC_1;
+	 o_inst++;
+
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = MAKE_VSF_SOURCE(u_temp_i,
+		VSF_IN_COMPONENT_X,
+		VSF_IN_COMPONENT_Y,
+		VSF_IN_COMPONENT_Z,
+		VSF_IN_COMPONENT_W,
+		VSF_IN_CLASS_TMP,
+		/* Not 100% sure about this.
+		   NOTE(review): the FRC above already reads the (possibly negated)
+		   source, so subtracting the temp unconditionally looks like what
+		   floor(s) = s - frac(s) needs - confirm on hw before changing. */
+		(!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
+
+	 o_inst->src2 = UNUSED_SRC_0;
+	 u_temp_i--;
+	 goto next;
+
+      case OPCODE_LG2:// LG2 RESULT 1.X Y Z W PARAM 0{} {X X X X}
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_LG2, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_src_class(src[0].File),
+		src[0].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+	 o_inst->src1 = UNUSED_SRC_0;
+	 o_inst->src2 = UNUSED_SRC_0;
+	 goto next;
+
+      case OPCODE_LIT://LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W} 
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_LIT, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+/* r200 in contrast to r300 does not seem to need any complicated setup,
+   its LIT instruction is "more native" */
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = UNUSED_SRC_0;
+	 o_inst->src2 = UNUSED_SRC_0;
+	 goto next;
+
+      case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W} 
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+		VSF_IN_COMPONENT_ONE,
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+	 o_inst->src1 = t_src(vp, &src[1]);
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_XPD:
+	 /* mul r0, r1.yzxw, r2.zxyw
+	    mad r0, -r2.yzxw, r1.zxyw, r0
+	    NOTE: might need MAD_2
+	  */
+
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
+	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+	    t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
+		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+
+	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
+		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
+		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
+		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
+		t_src_class(src[1].File),
+		src[1].NegateBase) | (src[1].RelAddr << 4);
+
+	 o_inst->src2 = UNUSED_SRC_1;
+	 o_inst++;
+	 u_temp_i--;
+
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MAD, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
+		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
+		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
+		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
+		t_src_class(src[1].File),
+		(!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+
+	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
+		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+
+	 /* u_temp_i was already decremented, the MUL result lives in u_temp_i+1 */
+	 o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1,
+		VSF_IN_COMPONENT_X,
+		VSF_IN_COMPONENT_Y,
+		VSF_IN_COMPONENT_Z,
+		VSF_IN_COMPONENT_W,
+		VSF_IN_CLASS_TMP,
+		VSF_FLAG_NONE);
+
+	 goto next;
+
+      case OPCODE_SWZ:
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = ZERO_SRC_0;
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_RCC:
+	 if (R200_DEBUG & DEBUG_FALLBACKS) {
+	    fprintf(stderr, "Don't know how to handle op %d yet\n", vpi->Opcode);
+	 }
+	 return GL_FALSE;
+      break;
+      case OPCODE_END:
+	 break;
+      default:
+	 break;
+      }
+
+      /* everything else maps 1:1 to a hw opcode */
+      o_inst->op = MAKE_VSF_OP(t_opcode(vpi->Opcode), t_dst(&vpi->DstReg),
+	    t_dst_mask(vpi->DstReg.WriteMask));
+
+      if(are_srcs_scalar){
+	 switch(operands){
+	    case 1:
+		o_inst->src0 = t_src_scalar(vp, &src[0]);
+		o_inst->src1 = UNUSED_SRC_0;
+		o_inst->src2 = UNUSED_SRC_1;
+	    break;
+
+	    case 2:
+		o_inst->src0 = t_src_scalar(vp, &src[0]);
+		o_inst->src1 = t_src_scalar(vp, &src[1]);
+		o_inst->src2 = UNUSED_SRC_1;
+	    break;
+
+	    case 3:
+		o_inst->src0 = t_src_scalar(vp, &src[0]);
+		o_inst->src1 = t_src_scalar(vp, &src[1]);
+		o_inst->src2 = t_src_scalar(vp, &src[2]);
+	    break;
+
+	    default:
+		fprintf(stderr, "illegal number of operands %lu\n", operands);
+		exit(-1);
+	    break;
+	 }
+      } else {
+	 switch(operands){
+	    case 1:
+		o_inst->src0 = t_src(vp, &src[0]);
+		o_inst->src1 = UNUSED_SRC_0;
+		o_inst->src2 = UNUSED_SRC_1;
+	    break;
+
+	    case 2:
+		o_inst->src0 = t_src(vp, &src[0]);
+		o_inst->src1 = t_src(vp, &src[1]);
+		o_inst->src2 = UNUSED_SRC_1;
+	    break;
+
+	    case 3:
+		o_inst->src0 = t_src(vp, &src[0]);
+		o_inst->src1 = t_src(vp, &src[1]);
+		o_inst->src2 = t_src(vp, &src[2]);
+	    break;
+
+	    default:
+		fprintf(stderr, "illegal number of operands %lu\n", operands);
+		exit(-1);
+	    break;
+	 }
+      }
+      next:
+      if ((o_inst->op & R200_VSF_OUT_CLASS_MASK) == R200_VSF_OUT_CLASS_RESULT_POS) {
+	 vp->pos_end = (o_inst - vp->instr);
+      }
+
+      /* Check temp usage after the instruction has been translated, so that
+	 the last instruction of the program is covered too.  u_temp_i + 1 is
+	 the lowest scratch temp this instruction used (R200_VSF_MAX_TEMPS if
+	 it used none); the program's own temps occupy 0 .. NumTemporaries-1,
+	 so they collide exactly when u_temp_i + 1 < NumTemporaries. */
+      if (u_temp_i + 1 < (int) mesa_vp->Base.NumTemporaries) {
+	 if (R200_DEBUG & DEBUG_FALLBACKS) {
+	    fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", mesa_vp->Base.NumTemporaries, u_temp_i);
+	 }
+	 return GL_FALSE;
+      }
+   }
+
+   /* need to test again since some instructions require more than one (up to 3) native inst */
+   if(o_inst - vp->instr > R200_VSF_MAX_INST) {
+      mesa_vp->Base.NumNativeInstructions = R200_VSF_MAX_INST + 1;
+      if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 fprintf(stderr, "more than 128 native instructions\n");
+      }
+      return GL_FALSE;
+   }
+   vp->native = GL_TRUE;
+   mesa_vp->Base.NumNativeInstructions = (o_inst - vp->instr);
+#if 0
+   fprintf(stderr, "hw program:\n");
+   for(i=0; i < vp->program.length; i++)
+      fprintf(stderr, "%08x\n", vp->instr[i]);
+#endif
+   return GL_TRUE;
+}
+
+/* Make the current vertex program the active hw program.
+   Translates the program first if that has not happened yet, raises or
+   clears the vertex program TCL fallback, and on success programs the PVS
+   control words and - if a different program is currently loaded - copies
+   the native instructions into the two VPI state atoms (64 instructions each). */
+void r200SetupVertexProg( GLcontext *ctx ) {
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct r200_vertex_program *vp = (struct r200_vertex_program *)ctx->VertexProgram.Current;
+   drm_radeon_cmd_header_t hdr;
+   GLboolean fallback;
+   GLuint count, lower, upper, n;
+
+   if (!vp->translated) {
+      rmesa->curr_vp_hw = NULL;
+      r200_translate_vertex_program(vp);
+   }
+   /* could optimize setting up vertex progs away for non-tcl hw */
+   fallback = !vp->native || !r200VertexProgUpdateParams(ctx, vp) ||
+      !rmesa->r200Screen->drmSupportsVertexProgram;
+   TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback);
+   if (fallback)
+      return;
+
+   R200_STATECHANGE( rmesa, pvs );
+
+   rmesa->hw.pvs.cmd[PVS_CNTL_1] = (0 << R200_PVS_CNTL_1_PROGRAM_START_SHIFT) |
+      ((vp->mesa_program.Base.NumNativeInstructions - 1) << R200_PVS_CNTL_1_PROGRAM_END_SHIFT) |
+      (vp->pos_end << R200_PVS_CNTL_1_POS_END_SHIFT);
+   rmesa->hw.pvs.cmd[PVS_CNTL_2] = (0 << R200_PVS_CNTL_2_PARAM_OFFSET_SHIFT) |
+      (vp->mesa_program.Base.NumNativeParameters << R200_PVS_CNTL_2_PARAM_COUNT_SHIFT);
+
+   /* maybe user clip planes just work with vertex progs... untested */
+   if (ctx->Transform.ClipPlanesEnabled) {
+      R200_STATECHANGE( rmesa, tcl );
+      if (!vp->mesa_program.IsPositionInvariant) {
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(0xfc);
+      }
+      else {
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (ctx->Transform.ClipPlanesEnabled << 2);
+      }
+   }
+
+   /* the instructions only need uploading when the hw holds another program */
+   if (vp == rmesa->curr_vp_hw)
+      return;
+
+   count = vp->mesa_program.Base.NumNativeInstructions;
+   lower = (count > 64) ? 64 : count;
+
+   R200_STATECHANGE( rmesa, vpi[0] );
+   R200_STATECHANGE( rmesa, vpi[1] );
+
+   /* FIXME: what about using a memcopy... */
+   for (n = 0; n < lower; n++) {
+      rmesa->hw.vpi[0].cmd[VPI_OPDST_0 + 4 * n] = vp->instr[n].op;
+      rmesa->hw.vpi[0].cmd[VPI_SRC0_0 + 4 * n] = vp->instr[n].src0;
+      rmesa->hw.vpi[0].cmd[VPI_SRC1_0 + 4 * n] = vp->instr[n].src1;
+      rmesa->hw.vpi[0].cmd[VPI_SRC2_0 + 4 * n] = vp->instr[n].src2;
+   }
+   /* hack up the cmd_size so not the whole state atom is emitted always.
+      This may require some more thought, we may emit half progs on lost state, but
+      hopefully it won't matter?
+      WARNING: must not use R200_DB_STATECHANGE, this will produce bogus (and rejected)
+      packet emits (due to the mismatched cmd_size and count in cmd/last_cmd) */
+   rmesa->hw.vpi[0].cmd_size = 1 + 4 * lower;
+   hdr.i = rmesa->hw.vpi[0].cmd[VPI_CMD_0];
+   hdr.veclinear.count = lower;
+   rmesa->hw.vpi[0].cmd[VPI_CMD_0] = hdr.i;
+
+   if (count > 64) {
+      /* instructions 64 and up go into the second atom */
+      upper = count - 64;
+      for (n = 0; n < upper; n++) {
+	 rmesa->hw.vpi[1].cmd[VPI_OPDST_0 + 4 * n] = vp->instr[n + 64].op;
+	 rmesa->hw.vpi[1].cmd[VPI_SRC0_0 + 4 * n] = vp->instr[n + 64].src0;
+	 rmesa->hw.vpi[1].cmd[VPI_SRC1_0 + 4 * n] = vp->instr[n + 64].src1;
+	 rmesa->hw.vpi[1].cmd[VPI_SRC2_0 + 4 * n] = vp->instr[n + 64].src2;
+      }
+      rmesa->hw.vpi[1].cmd_size = 1 + 4 * upper;
+      hdr.i = rmesa->hw.vpi[1].cmd[VPI_CMD_0];
+      hdr.veclinear.count = upper;
+      rmesa->hw.vpi[1].cmd[VPI_CMD_0] = hdr.i;
+   }
+   rmesa->curr_vp_hw = vp;
+}
+
+
+/* BindProgram driver hook.  Binding a vertex program forgets which program
+   is loaded in hw, so that r200SetupVertexProg uploads the instructions
+   again; any other target is reported as a problem. */
+static void r200BindProgram(GLcontext *ctx, GLenum target, struct program *prog)
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   if (target != GL_VERTEX_PROGRAM_ARB) {
+      _mesa_problem(ctx, "Target not supported yet!");
+      return;
+   }
+
+   rmesa->curr_vp_hw = NULL;
+}
+
+/* NewProgram driver hook: allocate and initialize a program object.
+   Vertex programs are allocated as the larger struct r200_vertex_program so
+   that the driver can keep its translation state next to the Mesa program.
+   Returns NULL for an unknown target or when the vertex program allocation fails. */
+static struct program *r200NewProgram(GLcontext *ctx, GLenum target, GLuint id)
+{
+   struct r200_vertex_program *vp;
+
+   switch(target){
+   case GL_VERTEX_PROGRAM_ARB:
+      vp = CALLOC_STRUCT(r200_vertex_program);
+      /* don't form a member address from a failed allocation */
+      if (vp == NULL)
+	 return NULL;
+      return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id);
+   case GL_FRAGMENT_PROGRAM_ARB:
+   case GL_FRAGMENT_PROGRAM_NV:
+      return _mesa_init_fragment_program( ctx, CALLOC_STRUCT(fragment_program), target, id );
+   default:
+      _mesa_problem(ctx, "Bad target in r200NewProgram");
+   }
+   return NULL;	
+}
+
+
+/* DeleteProgram driver hook: simply hands the program to _mesa_delete_program. */
+static void r200DeleteProgram(GLcontext *ctx, struct program *prog)
+{
+   _mesa_delete_program(ctx, prog);
+}
+
+/* ProgramStringNotify driver hook, called when a program string changed.
+   For vertex programs all driver private state (everything from the
+   "translated" field to the end of struct r200_vertex_program) is cleared,
+   so the program counts as untranslated again.  Translation itself is
+   deferred to r200SetupVertexProg / r200IsProgramNative. */
+static void r200ProgramStringNotify(GLcontext *ctx, GLenum target, struct program *prog)
+{
+   struct r200_vertex_program *vp = (void *)prog;
+
+   switch(target) {
+   case GL_VERTEX_PROGRAM_ARB:
+      /* Clearing also resets vp->translated to GL_FALSE.  The length is
+	 taken from the real position of the field, so it stays correct even
+	 if the compiler pads between the Mesa program and the driver fields. */
+      memset(&vp->translated, 0,
+	     (char *)vp + sizeof(*vp) - (char *)&vp->translated);
+      break;
+   default:
+      break;
+   }
+   /* need this for tcl fallbacks */
+   _tnl_program_string(ctx, target, prog);
+}
+
+/* IsProgramNative driver hook.  For vertex programs the program is
+   translated on demand and the result of the translation is reported;
+   any other target is reported as a problem and counts as not native. */
+static GLboolean r200IsProgramNative(GLcontext *ctx, GLenum target, struct program *prog)
+{
+   struct r200_vertex_program *vp = (void *)prog;
+
+   switch(target){
+   case GL_VERTEX_STATE_PROGRAM_NV:
+   case GL_VERTEX_PROGRAM_ARB:
+      if (!vp->translated) {
+	 r200_translate_vertex_program(vp);
+      }
+     /* does not take parameters etc. into account */
+      return vp->native;
+   default:
+      /* the message used to name r200NewProgram (copy and paste leftover) */
+      _mesa_problem(ctx, "Bad target in r200IsProgramNative");
+   }
+   return GL_FALSE;
+}
+
+/* Plug the program related driver hooks of this file into the
+   device driver function table. */
+void r200InitShaderFuncs(struct dd_function_table *functions)
+{
+   /* object lifetime */
+   functions->NewProgram = r200NewProgram;
+   functions->DeleteProgram = r200DeleteProgram;
+
+   /* binding, program string changes and native queries */
+   functions->BindProgram = r200BindProgram;
+   functions->ProgramStringNotify = r200ProgramStringNotify;
+   functions->IsProgramNative = r200IsProgramNative;
+}
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.h b/src/mesa/drivers/dri/r200/r200_vertprog.h
new file mode 100644
index 00000000000..00ad2dd1b30
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.h
@@ -0,0 +1,160 @@
+#ifndef __VERTEX_SHADER_H__
+#define __VERTEX_SHADER_H__
+
+#include "r200_reg.h"
+
+typedef struct {   /* one vertex program instruction, stored as four 32-bit words */
+   uint32_t op;    /* first word of the instruction; values are built with MAKE_VSF_OP */
+   uint32_t src0;  /* src0..src2: presumably source operand words as built by MAKE_VSF_SOURCE -- TODO confirm */
+   uint32_t src1;
+   uint32_t src2;
+} VERTEX_SHADER_INSTRUCTION;
+
+extern void r200InitShaderFuncs(struct dd_function_table *functions);  /* fills in the program-related hooks of the table */
+extern void r200SetupVertexProg( GLcontext *ctx );  /* NOTE(review): definition not visible from this header; confirm what state it sets up */
+
+#define VSF_FLAG_X	1	/* per-component bit flags: one bit each for x, y, z, w */
+#define VSF_FLAG_Y	2
+#define VSF_FLAG_Z	4
+#define VSF_FLAG_W	8
+#define VSF_FLAG_XYZ	(VSF_FLAG_X | VSF_FLAG_Y | VSF_FLAG_Z)
+#define VSF_FLAG_ALL	0xf	/* all four component bits set */
+#define VSF_FLAG_NONE	0	/* no component; passed as the "negate" argument by the VSF_* source macros */
+
+#define R200_VSF_MAX_INST	128	/* NOTE(review): presumably the max number of instructions per program -- confirm */
+#define R200_VSF_MAX_PARAM	192	/* NOTE(review): presumably the max number of program parameters -- confirm */
+#define R200_VSF_MAX_TEMPS	12	/* NOTE(review): presumably the max number of temporary registers -- confirm */
+
+#define R200_VPI_OUT_REG_INDEX_SHIFT            13  /* output register index field: five bits starting at bit 13 */
+#define R200_VPI_OUT_REG_INDEX_MASK             (31 << 13) /* GUESS based on fglrx native limits */
+
+#define R200_VPI_OUT_WRITE_X                    (1 << 20)  /* per-component write bits 20..23; MAKE_VSF_OP shifts out_reg_fields to this position */
+#define R200_VPI_OUT_WRITE_Y                    (1 << 21)
+#define R200_VPI_OUT_WRITE_Z                    (1 << 22)
+#define R200_VPI_OUT_WRITE_W                    (1 << 23)
+
+#define R200_VPI_IN_REG_CLASS_TEMPORARY         (0 << 0)  /* register class field, in the low bits of a source word */
+#define R200_VPI_IN_REG_CLASS_ATTRIBUTE         (1 << 0)
+#define R200_VPI_IN_REG_CLASS_PARAMETER         (2 << 0)
+#define R200_VPI_IN_REG_CLASS_NONE              (9 << 0)
+#define R200_VPI_IN_REG_CLASS_MASK              (31 << 0) /* GUESS */
+
+#define R200_VPI_IN_REG_INDEX_SHIFT             5  /* register index field sits above the class field; MAKE_VSF_SOURCE uses this shift */
+#define R200_VPI_IN_REG_INDEX_MASK              (255 << 5) /* GUESS based on fglrx native limits */
+
+/* The R200 can select components from the input register arbitrarily.
+ * Use the following constants, shifted by the component shift you
+ * want to select */
+#define R200_VPI_IN_SELECT_X    0  /* selector values; each fits the 3-bit R200_VPI_IN_SELECT_MASK */
+#define R200_VPI_IN_SELECT_Y    1
+#define R200_VPI_IN_SELECT_Z    2
+#define R200_VPI_IN_SELECT_W    3
+#define R200_VPI_IN_SELECT_ZERO 4  /* NOTE(review): by its name selects a constant 0 instead of a component -- confirm */
+#define R200_VPI_IN_SELECT_ONE  5  /* NOTE(review): by its name selects a constant 1 instead of a component -- confirm */
+#define R200_VPI_IN_SELECT_MASK 7
+
+#define R200_VPI_IN_X_SHIFT                     13  /* position of each result component's selector in a source word, 3 bits apart */
+#define R200_VPI_IN_Y_SHIFT                     16
+#define R200_VPI_IN_Z_SHIFT                     19
+#define R200_VPI_IN_W_SHIFT                     22
+
+#define R200_VPI_IN_NEG_X                       (1 << 25)  /* per-component negate bits 25..28, directly above the W selector */
+#define R200_VPI_IN_NEG_Y                       (1 << 26)
+#define R200_VPI_IN_NEG_Z                       (1 << 27)
+#define R200_VPI_IN_NEG_W                       (1 << 28)
+
+#define R200_VSF_OUT_CLASS_TMP			(0 << 8)  /* output register class field, starting at bit 8 */
+#define R200_VSF_OUT_CLASS_ADDR			(3 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_POS		(4 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_COLOR		(5 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_TEXC		(6 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_FOGC		(7 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_POINTSIZE	(8 << 8)
+#define R200_VSF_OUT_CLASS_MASK			(31 << 8)  /* five-bit field */
+
+/* opcodes - they all are the same as on r300 it seems */
+#define R200_VPI_OUT_OP_DOT                     (1 << 0)  /* opcode field is unshifted (<< 0); MAKE_VSF_OP ORs it in as is */
+#define R200_VPI_OUT_OP_MUL                     (2 << 0)
+#define R200_VPI_OUT_OP_ADD                     (3 << 0)
+#define R200_VPI_OUT_OP_MAD                     (4 << 0)
+#define R200_VPI_OUT_OP_DST                     (5 << 0)  /* NOTE(review): defined here but absent from the "possible operations" list below */
+#define R200_VPI_OUT_OP_FRC                     (6 << 0)
+#define R200_VPI_OUT_OP_MAX                     (7 << 0)
+#define R200_VPI_OUT_OP_MIN                     (8 << 0)
+#define R200_VPI_OUT_OP_SGE                     (9 << 0)
+#define R200_VPI_OUT_OP_SLT                     (10 << 0)
+
+#define R200_VPI_OUT_OP_ARL                     (13 << 0)
+
+#define R200_VPI_OUT_OP_EXP                     (65 << 0)
+#define R200_VPI_OUT_OP_LOG                     (66 << 0)
+
+#define R200_VPI_OUT_OP_LIT                     (68 << 0)
+#define R200_VPI_OUT_OP_POW                     (69 << 0)
+#define R200_VPI_OUT_OP_RCP                     (70 << 0)
+#define R200_VPI_OUT_OP_RSQ                     (72 << 0)
+
+#define R200_VPI_OUT_OP_EX2                     (75 << 0)
+#define R200_VPI_OUT_OP_LG2                     (76 << 0)
+
+#define R200_VPI_OUT_OP_MAD_2                   (128 << 0)  /* NOTE(review): how MAD_2 differs from MAD is not documented in this header */
+
+/* first CARD32 of an instruction */
+
+/* possible operations: 
+    DOT, MUL, ADD, MAD, FRC, MAX, MIN, SGE, SLT, EXP, LOG, LIT, POW, RCP, RSQ, EX2,
+    LG2, MAD_2, ARL */
+
+#define MAKE_VSF_OP(op, out_reg, out_reg_fields) \
+   ((op) | (out_reg) | ((out_reg_fields) << 20) ) /* op and out_reg are ORed in as given; out_reg_fields is shifted to the R200_VPI_OUT_WRITE_* bits */
+
+#define VSF_IN_CLASS_TMP	0	/* short names for EASY_VSF_SOURCE; same values as R200_VPI_IN_REG_CLASS_* */
+#define VSF_IN_CLASS_ATTR	1
+#define VSF_IN_CLASS_PARAM	2
+#define VSF_IN_CLASS_NONE	9
+
+#define VSF_IN_COMPONENT_X	0	/* short names for EASY_VSF_SOURCE; same values as R200_VPI_IN_SELECT_* */
+#define VSF_IN_COMPONENT_Y	1
+#define VSF_IN_COMPONENT_Z	2
+#define VSF_IN_COMPONENT_W	3
+#define VSF_IN_COMPONENT_ZERO	4
+#define VSF_IN_COMPONENT_ONE	5
+
+#define MAKE_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \
+	( ((in_reg_index)<<R200_VPI_IN_REG_INDEX_SHIFT) \
+	   | ((comp_x)<<R200_VPI_IN_X_SHIFT) \
+	   | ((comp_y)<<R200_VPI_IN_Y_SHIFT) \
+	   | ((comp_z)<<R200_VPI_IN_Z_SHIFT) \
+	   | ((comp_w)<<R200_VPI_IN_W_SHIFT) \
+	   | ((negate)<<25) | ((class))) /* builds one source word; negate is a VSF_FLAG_* mask that lands on R200_VPI_IN_NEG_X..W (bits 25..28) */
+
+#define EASY_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \
+	MAKE_VSF_SOURCE(in_reg_index, \
+		VSF_IN_COMPONENT_##comp_x, \
+		VSF_IN_COMPONENT_##comp_y, \
+		VSF_IN_COMPONENT_##comp_z, \
+		VSF_IN_COMPONENT_##comp_w, \
+		VSF_IN_CLASS_##class, VSF_FLAG_##negate) /* takes short names (X, ONE, ATTR, NONE, ...) and token-pastes them onto the VSF_IN_* / VSF_FLAG_* prefixes */
+
+/* special sources: */
+
+/* (1.0,1.0,1.0,1.0) vector (ATTR, plain ) */
+#define VSF_ATTR_UNITY(reg) 	EASY_VSF_SOURCE(reg, ONE, ONE, ONE, ONE, ATTR, NONE)
+#define VSF_UNITY(reg) 	EASY_VSF_SOURCE(reg, ONE, ONE, ONE, ONE, NONE, NONE)  /* same ONE swizzle, but with register class NONE */
+
+/* contents of unmodified register */
+#define VSF_REG(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, ATTR, NONE)  /* identity swizzle, ATTR class, nothing negated */
+
+/* contents of unmodified parameter */
+#define VSF_PARAM(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, PARAM, NONE)
+
+/* contents of unmodified temporary register */
+#define VSF_TMP(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, TMP, NONE)
+
+/* components of ATTR register */
+#define VSF_ATTR_X(reg) EASY_VSF_SOURCE(reg, X, X, X, X, ATTR, NONE)  /* each macro replicates one component into all four slots */
+#define VSF_ATTR_Y(reg) EASY_VSF_SOURCE(reg, Y, Y, Y, Y, ATTR, NONE)
+#define VSF_ATTR_Z(reg) EASY_VSF_SOURCE(reg, Z, Z, Z, Z, ATTR, NONE)
+#define VSF_ATTR_W(reg) EASY_VSF_SOURCE(reg, W, W, W, W, ATTR, NONE)
+
+#endif
diff --git a/src/mesa/drivers/dri/r200/r200_vtxfmt.c b/src/mesa/drivers/dri/r200/r200_vtxfmt.c
index 673076d0605..8480ce73e88 100644
--- a/src/mesa/drivers/dri/r200/r200_vtxfmt.c
+++ b/src/mesa/drivers/dri/r200/r200_vtxfmt.c
@@ -663,7 +663,9 @@ static GLboolean check_vtx_fmt( GLcontext *ctx )
    GLuint count[R200_MAX_TEXTURE_UNITS];
 
    if (rmesa->TclFallback || rmesa->vb.fell_back || ctx->CompileFlag ||
-      (ctx->Fog.Enabled && (ctx->Fog.FogCoordinateSource == GL_FOG_COORD)))
+      (ctx->Fog.Enabled && (ctx->Fog.FogCoordinateSource == GL_FOG_COORD)) ||
+      /* TODO: set tcl out fmt/compsel and reenable vtxfmt code */
+      ctx->VertexProgram._Enabled)
       return GL_FALSE;
 
    if (ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT)