From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 7 Jun 2005 10:59:37 +0000 (+0000)
Subject: Simplify interpreted language:
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=757e0855adb1b1eb45b55e1fcf6acb47224b2853;p=mesa.git

Simplify interpreted language:

- Expand operand argument specifiers so that all registers and
  parameters can be referenced directly.  Remove old PAR/PRL
  instructions.

- Remove 3-operand instructions; translate MAD into MUL followed by ADD
  (there is no room for 3 full operands in the instruction struct).

- Translate SWZ instructions into 1 or 2 reduced swizzles at compile
  time.

- Add hardwired code for moving input and output values to/from the
  register file.  Drop old INx, OUT instructions.
---

diff --git a/src/mesa/tnl/t_vb_arbprogram.c b/src/mesa/tnl/t_vb_arbprogram.c
index 3be82c72985..5494eed0984 100644
--- a/src/mesa/tnl/t_vb_arbprogram.c
+++ b/src/mesa/tnl/t_vb_arbprogram.c
@@ -49,118 +49,87 @@
 
 /* New, internal instructions:
  */
-#define IN1        (VP_OPCODE_XPD+1)
-#define IN2        (IN1+1)	/* intput-to-reg MOV */
-#define IN3        (IN1+2)
-#define IN4        (IN1+3)
-#define OUT        (IN1+4)	/* reg-to-output MOV */
-#define OUM        (IN1+5)	/* reg-to-output MOV with mask */
-#define RSW        (IN1+6)
-#define MSK        (IN1+7)	/* reg-to-reg MOV with mask */
-#define PAR        (IN1+8)      /* parameter-to-reg MOV */
-#define PRL        (IN1+9)      /* parameter-to-reg MOV */
+#define RSW        (VP_MAX_OPCODE)
+#define SEL        (VP_MAX_OPCODE+1)
+#define REL        (VP_MAX_OPCODE+2)
 
 
 /* Layout of register file:
 
   0 -- Scratch (Arg0)
   1 -- Scratch (Arg1)
-  2 -- Scratch (Arg2)
-  3 -- Scratch (Result)
+  2 -- Scratch (Arg2), 3 -- Scratch (Result)
   4 -- Program Temporary 0
+  16 -- Program Temporary 11 (max for NV_VERTEX_PROGRAM)
+  17 -- Output 0
+  31 -- Output 14 (max for NV_VERTEX_PROGRAM) (Last writeable register)
+  32 -- Parameter 0
   ..
-  31 -- Program Temporary 27
-  32 -- State/Input/Const shadow 0
-  ..
-  63 -- State/Input/Const shadow 31
+  127 -- Parameter 63 (max for NV_VERTEX_PROGRAM)
 
 */
 
-
-
-#define REG_ARG0  0
-#define REG_ARG1  1
-#define REG_ARG2  2
-#define REG_RES   3
-#define REG_TMP0  4
-#define REG_TMP_MAX 32
-#define REG_TMP_NR (REG_TMP_MAX-REG_TMP0)
-#define REG_PAR0  32
-#define REG_PAR_MAX 64
-#define REG_PAR_NR (REG_PAR_MAX-REG_PAR0)
-
-#define REG_MAX 64
-#define REG_SWZDST_MAX 16
+#define FILE_REG         0
+#define FILE_LOCAL_PARAM 1
+#define FILE_ENV_PARAM   2
+#define FILE_STATE_PARAM 3
+
+
+#define REG_ARG0   0
+#define REG_ARG1   1
+#define REG_ARG2   2
+#define REG_RES    3
+#define REG_ADDR   4
+#define REG_TMP0   5
+#define REG_TMP11  16
+#define REG_OUT0   17
+#define REG_OUT14  31
+#define REG_IN0    32
+#define REG_IN15   47
+#define REG_ID     48		/* 0,0,0,1 */
+#define REG_MAX    128
+#define REG_INVALID ~0
 
 /* ARB_vp instructions are broken down into one or more of the
  * following micro-instructions, each representable in a 32 bit packed
  * structure.
  */
 
-
-union instruction {
-   struct {
-      GLuint opcode:6;
-      GLuint dst:5;
-      GLuint arg0:6;
-      GLuint arg1:6;
-      GLuint elt:2;		/* x,y,z or w */
-      GLuint pad:7;
-   } scl;
+struct reg {
+   GLuint file:2;
+   GLuint idx:7;
+};
 
 
+union instruction {
    struct {
       GLuint opcode:6;
       GLuint dst:5;
-      GLuint arg0:6;
-      GLuint arg1:6;
-      GLuint arg2:6;
+      GLuint file0:2;
+      GLuint idx0:7;
+      GLuint file1:2;
+      GLuint idx1:7;
       GLuint pad:3;
-   } vec;
+   } alu;
 
    struct {
       GLuint opcode:6;
-      GLuint dst:4;		/* NOTE!  REG 0..16 only! */
-      GLuint arg0:6;
-      GLuint neg:4;		
-      GLuint swz:12;		
-   } swz;
-
-   struct {
-      GLuint opcode:6;
-      GLuint dst:6;
-      GLuint arg0:6;
-      GLuint neg:1;		/* 1 bit only */
+      GLuint dst:5;
+      GLuint file0:2;
+      GLuint idx0:7;
+      GLuint neg:4;
       GLuint swz:8;		/* xyzw only */
-      GLuint pad:5;
    } rsw;
 
-   struct {
-      GLuint opcode:6;
-      GLuint reg:6;
-      GLuint file:5;
-      GLuint idx:8;		/* plenty? */
-      GLuint rel:1;
-      GLuint pad:6;
-   } inr;
-
-
-   struct {
-      GLuint opcode:6;
-      GLuint reg:6;
-      GLuint file:5;
-      GLuint idx:8;		/* plenty? */
-      GLuint mask:4;
-      GLuint pad:3;
-   } out;
-
    struct {
       GLuint opcode:6;
       GLuint dst:5;
-      GLuint arg0:6;
+      GLuint idx0:7;		/* note: no file0 -- arg0 is always read from m->reg[] */
+      GLuint file1:2;
+      GLuint idx1:7;
       GLuint mask:4;
-      GLuint pad:11;
-   } msk;
+      GLuint pad:1;
+   } sel;
 
    GLuint dword;
 };
@@ -168,32 +137,39 @@ union instruction {
 
 
 struct compilation {
-   struct {
-      GLuint file:5;
-      GLuint idx:8; 
-   } reg[REG_PAR_NR];
-
-   GLuint par_active;
-   GLuint par_protected;
-   GLuint tmp_active;
-   
+   GLuint reg_active;
    union instruction *csr;
-
    struct vertex_buffer *VB;	/* for input sizes! */
 };
 
+struct input {
+   GLuint idx;
+   GLfloat *data;
+   GLuint stride;
+   GLuint size;
+};
+
+struct output {
+   GLuint idx;
+   GLfloat *data;
+};
+
 /*--------------------------------------------------------------------------- */
 
 /*!
  * Private storage for the vertex program pipeline stage.
  */
 struct arb_vp_machine {
-   GLfloat reg[REG_MAX][4];	/* Program temporaries, shadowed parameters and inputs,
-				   plus some internal values */
-
-   GLfloat (*File[8])[4];	/* Src/Dest for PAR/PRL instructions. */
+   GLfloat reg[REG_MAX][4];	/* Program temporaries, inputs and outputs */
+   GLfloat (*File[4])[4];	/* All values reference-able from the program. */
    GLint AddressReg;
 
+   struct input input[16];
+   GLuint nr_inputs;
+
+   struct output output[15];
+   GLuint nr_outputs;
+
    union instruction store[1024];
    union instruction *instructions;
    GLint nr_instructions;
@@ -213,10 +189,8 @@ struct arb_vp_machine {
 /*--------------------------------------------------------------------------- */
 
 struct opcode_info {
-   GLuint type;
    GLuint nr_args;
    const char *string;
-   void (*func)( struct arb_vp_machine *, union instruction );
    void (*print)( union instruction , const struct opcode_info * );
 };
 
@@ -272,11 +246,7 @@ static GLfloat RoughApproxPow2(GLfloat t)
 
 static GLfloat RoughApproxPower(GLfloat x, GLfloat y)
 {
-#if 0
-   return (GLfloat) exp(y * log(x));
-#else
    return (GLfloat) _mesa_pow(x, y);
-#endif
 }
 
 
@@ -284,156 +254,50 @@ static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
 
 
 
-
-/**
- * This is probably the least-optimal part of the process, have to
- * multiply out the stride to access each incoming input value.
- */
-static GLfloat *get_input( struct arb_vp_machine *m, GLuint index )
-{
-   return VEC_ELT(m->VB->AttribPtr[index], GLfloat, m->vtx_nr);
-}
+#define GET_RSW(swz, idx)      (((swz) >> ((idx)*2)) & 0x3)
 
 
-/**
- * Fetch a 4-element float vector from the given source register.
- * Deal with the possibility that not all elements are present.
- */
-static void do_IN1( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *result = m->reg[op.inr.reg];
-   const GLfloat *src = get_input(m, op.inr.idx);
-
-   result[0] = src[0];
-   result[1] = 0;
-   result[2] = 0;
-   result[3] = 1;
-}
-
-static void do_IN2( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *result = m->reg[op.inr.reg];
-   const GLfloat *src = get_input(m, op.inr.idx);
-   
-   result[0] = src[0];
-   result[1] = src[1];
-   result[2] = 0;
-   result[3] = 1;
-}
-
-static void do_IN3( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *result = m->reg[op.inr.reg];
-   const GLfloat *src = get_input(m, op.inr.idx);
-
-   result[0] = src[0];
-   result[1] = src[1];
-   result[2] = src[2];
-   result[3] = 1;
-}
-
-static void do_IN4( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *result = m->reg[op.inr.reg];
-   const GLfloat *src = get_input(m, op.inr.idx);
-   
-   result[0] = src[0];
-   result[1] = src[1];
-   result[2] = src[2];
-   result[3] = src[3];
-}
-
 /**
  * Perform a reduced swizzle:
  */
 static void do_RSW( struct arb_vp_machine *m, union instruction op ) 
 {
    GLfloat *result = m->reg[op.rsw.dst];
-   const GLfloat *arg0 = m->reg[op.rsw.arg0];
+   const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
    GLuint swz = op.rsw.swz;
    GLuint neg = op.rsw.neg;
-   GLuint i;
-
-   if (neg) 
-      for (i = 0; i < 4; i++, swz >>= 2) 
-	 result[i] = -arg0[swz & 0x3];
-   else
-      for (i = 0; i < 4; i++, swz >>= 2) 
-	 result[i] = arg0[swz & 0x3];
-}
-
-
-
-/**
- * Store 4 floats into an external address.
- */
-static void do_OUM( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *dst = m->attribs[op.out.idx].data[m->vtx_nr];
-   const GLfloat *value = m->reg[op.out.reg];
-
-   if (op.out.mask & 0x1) dst[0] = value[0];
-   if (op.out.mask & 0x2) dst[1] = value[1];
-   if (op.out.mask & 0x4) dst[2] = value[2];
-   if (op.out.mask & 0x8) dst[3] = value[3];
-}
-
-static void do_OUT( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *dst = m->attribs[op.out.idx].data[m->vtx_nr];
-   const GLfloat *value = m->reg[op.out.reg];
 
-   dst[0] = value[0];
-   dst[1] = value[1];
-   dst[2] = value[2];
-   dst[3] = value[3];
+   result[0] = arg0[GET_RSW(swz, 0)];
+   result[1] = arg0[GET_RSW(swz, 1)];
+   result[2] = arg0[GET_RSW(swz, 2)];
+   result[3] = arg0[GET_RSW(swz, 3)];
+   
+   if (neg) {
+      if (neg & 0x1) result[0] = -result[0];
+      if (neg & 0x2) result[1] = -result[1];
+      if (neg & 0x4) result[2] = -result[2];
+      if (neg & 0x8) result[3] = -result[3];
+   }
 }
 
-/* Register-to-register MOV with writemask.
+/* Used to implement write masking
  */
-static void do_MSK( struct arb_vp_machine *m, union instruction op )
+static void do_SEL( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *dst = m->reg[op.msk.dst];
-   const GLfloat *arg0 = m->reg[op.msk.arg0];
+   GLfloat *dst = m->reg[op.sel.dst];
+   const GLfloat *arg0 = m->reg[op.sel.idx0];
+   const GLfloat *arg1 = m->File[op.sel.file1][op.sel.idx1];
  
-   if (op.msk.mask & 0x1) dst[0] = arg0[0];
-   if (op.msk.mask & 0x2) dst[1] = arg0[1];
-   if (op.msk.mask & 0x4) dst[2] = arg0[2];
-   if (op.msk.mask & 0x8) dst[3] = arg0[3];
-}
-
-
-/* Retreive parameters and other constant values:
- */
-static void do_PAR( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *result = m->reg[op.inr.reg];
-   const GLfloat *src = m->File[op.inr.file][op.inr.idx];
-
-   result[0] = src[0];
-   result[1] = src[1];
-   result[2] = src[2];
-   result[3] = src[3];
+   dst[0] = (op.sel.mask & 0x1) ? arg0[0] : arg1[0];
+   dst[1] = (op.sel.mask & 0x2) ? arg0[1] : arg1[1];
+   dst[2] = (op.sel.mask & 0x4) ? arg0[2] : arg1[2];
+   dst[3] = (op.sel.mask & 0x8) ? arg0[3] : arg1[3];
 }
 
 
-#define RELADDR_MASK (MAX_NV_VERTEX_PROGRAM_PARAMS-1)
-
-static void do_PRL( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *result = m->reg[op.inr.reg];
-   GLuint index = (op.inr.idx + m->AddressReg) & RELADDR_MASK;
-   const GLfloat *src = m->File[op.inr.file][index];
-
-   result[0] = src[0];
-   result[1] = src[1];
-   result[2] = src[2];
-   result[3] = src[3];
-}
-
 static void do_PRT( struct arb_vp_machine *m, union instruction op )
 {
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
    
    _mesa_printf("%d: %f %f %f %f\n", m->vtx_nr, 
 		arg0[0], arg0[1], arg0[2], arg0[3]);
@@ -447,8 +311,8 @@ static void do_PRT( struct arb_vp_machine *m, union instruction op )
 
 static void do_ABS( struct arb_vp_machine *m, union instruction op ) 
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    result[0] = (arg0[0] < 0.0) ? -arg0[0] : arg0[0];
    result[1] = (arg0[1] < 0.0) ? -arg0[1] : arg0[1];
@@ -458,9 +322,9 @@ static void do_ABS( struct arb_vp_machine *m, union instruction op )
 
 static void do_ADD( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = arg0[0] + arg1[0];
    result[1] = arg0[1] + arg1[1];
@@ -471,16 +335,16 @@ static void do_ADD( struct arb_vp_machine *m, union instruction op )
 
 static void do_ARL( struct arb_vp_machine *m, union instruction op )
 {
-   const GLfloat *arg0 = m->reg[op.out.reg];
-   m->AddressReg = (GLint) floor(arg0[0]);
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   m->reg[REG_ADDR][0] = FLOORF(arg0[0]);
 }
 
 
 static void do_DP3( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
-   const GLfloat *arg1 = m->reg[op.scl.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = (arg0[0] * arg1[0] + 
 		arg0[1] * arg1[1] + 
@@ -489,26 +353,13 @@ static void do_DP3( struct arb_vp_machine *m, union instruction op )
    PUFF(result);
 }
 
-#if 0
-static void do_MAT4( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
-   const GLfloat *mat[] = m->reg[op.scl.arg1];
-
-   result[0] = (arg0[0] * mat0[0] + arg0[1] * mat0[1] + arg0[2] * mat0[2] + arg0[3] * mat0[3]);
-   result[1] = (arg0[0] * mat1[0] + arg0[1] * mat1[1] + arg0[2] * mat1[2] + arg0[3] * mat1[3]);
-   result[2] = (arg0[0] * mat2[0] + arg0[1] * mat2[1] + arg0[2] * mat2[2] + arg0[3] * mat2[3]);
-   result[3] = (arg0[0] * mat3[0] + arg0[1] * mat3[1] + arg0[2] * mat3[2] + arg0[3] * mat3[3]);
-}
-#endif
 
 
 static void do_DP4( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
-   const GLfloat *arg1 = m->reg[op.scl.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = (arg0[0] * arg1[0] + 
 		arg0[1] * arg1[1] + 
@@ -520,9 +371,9 @@ static void do_DP4( struct arb_vp_machine *m, union instruction op )
 
 static void do_DPH( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
-   const GLfloat *arg1 = m->reg[op.scl.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = (arg0[0] * arg1[0] + 
 		arg0[1] * arg1[1] + 
@@ -534,9 +385,9 @@ static void do_DPH( struct arb_vp_machine *m, union instruction op )
 
 static void do_DST( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = 1.0F;
    result[1] = arg0[1] * arg1[1];
@@ -547,8 +398,8 @@ static void do_DST( struct arb_vp_machine *m, union instruction op )
 
 static void do_EX2( struct arb_vp_machine *m, union instruction op ) 
 {
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    result[0] = (GLfloat)RoughApproxPow2(arg0[0]);
    PUFF(result);
@@ -556,8 +407,8 @@ static void do_EX2( struct arb_vp_machine *m, union instruction op )
 
 static void do_EXP( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
    GLfloat tmp = arg0[0];
    GLfloat flr_tmp = FLOORF(tmp);
 
@@ -572,8 +423,8 @@ static void do_EXP( struct arb_vp_machine *m, union instruction op )
 
 static void do_FLR( struct arb_vp_machine *m, union instruction op ) 
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    result[0] = FLOORF(arg0[0]);
    result[1] = FLOORF(arg0[1]);
@@ -583,8 +434,8 @@ static void do_FLR( struct arb_vp_machine *m, union instruction op )
 
 static void do_FRC( struct arb_vp_machine *m, union instruction op ) 
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    result[0] = arg0[0] - FLOORF(arg0[0]);
    result[1] = arg0[1] - FLOORF(arg0[1]);
@@ -594,8 +445,8 @@ static void do_FRC( struct arb_vp_machine *m, union instruction op )
 
 static void do_LG2( struct arb_vp_machine *m, union instruction op ) 
 {
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    result[0] = RoughApproxLog2(arg0[0]);
    PUFF(result);
@@ -605,8 +456,8 @@ static void do_LG2( struct arb_vp_machine *m, union instruction op )
 
 static void do_LIT( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    const GLfloat epsilon = 1.0F / 256.0F; /* per NV spec */
    GLfloat tmp[4];
@@ -624,8 +475,8 @@ static void do_LIT( struct arb_vp_machine *m, union instruction op )
 
 static void do_LOG( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
    GLfloat tmp = FABSF(arg0[0]);
    int exponent;
    GLfloat mantissa = FREXPF(tmp, &exponent);
@@ -636,25 +487,11 @@ static void do_LOG( struct arb_vp_machine *m, union instruction op )
    result[3] = 1.0;
 }
 
-
-static void do_MAD( struct arb_vp_machine *m, union instruction op )
-{
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
-   const GLfloat *arg2 = m->reg[op.vec.arg2];
-
-   result[0] = arg0[0] * arg1[0] + arg2[0];
-   result[1] = arg0[1] * arg1[1] + arg2[1];
-   result[2] = arg0[2] * arg1[2] + arg2[2];
-   result[3] = arg0[3] * arg1[3] + arg2[3];
-}
-
 static void do_MAX( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = (arg0[0] > arg1[0]) ? arg0[0] : arg1[0];
    result[1] = (arg0[1] > arg1[1]) ? arg0[1] : arg1[1];
@@ -665,9 +502,9 @@ static void do_MAX( struct arb_vp_machine *m, union instruction op )
 
 static void do_MIN( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = (arg0[0] < arg1[0]) ? arg0[0] : arg1[0];
    result[1] = (arg0[1] < arg1[1]) ? arg0[1] : arg1[1];
@@ -677,8 +514,8 @@ static void do_MIN( struct arb_vp_machine *m, union instruction op )
 
 static void do_MOV( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    result[0] = arg0[0];
    result[1] = arg0[1];
@@ -688,9 +525,9 @@ static void do_MOV( struct arb_vp_machine *m, union instruction op )
 
 static void do_MUL( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = arg0[0] * arg1[0];
    result[1] = arg0[1] * arg1[1];
@@ -701,18 +538,30 @@ static void do_MUL( struct arb_vp_machine *m, union instruction op )
 
 static void do_POW( struct arb_vp_machine *m, union instruction op ) 
 {
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
-   const GLfloat *arg1 = m->reg[op.scl.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = (GLfloat)RoughApproxPower(arg0[0], arg1[0]);
    PUFF(result);
 }
 
+static void do_REL( struct arb_vp_machine *m, union instruction op )
+{
+   GLfloat *result = m->reg[op.alu.dst];
+   GLuint idx = (op.alu.idx0 + (GLint)m->reg[REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1);
+   const GLfloat *arg0 = m->File[op.alu.file0][idx];
+
+   result[0] = arg0[0];
+   result[1] = arg0[1];
+   result[2] = arg0[2];
+   result[3] = arg0[3];
+}
+
 static void do_RCP( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    result[0] = 1.0F / arg0[0];  
    PUFF(result);
@@ -720,8 +569,8 @@ static void do_RCP( struct arb_vp_machine *m, union instruction op )
 
 static void do_RSQ( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.scl.dst];
-   const GLfloat *arg0 = m->reg[op.scl.arg0];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 
    result[0] = INV_SQRTF(FABSF(arg0[0]));
    PUFF(result);
@@ -730,9 +579,9 @@ static void do_RSQ( struct arb_vp_machine *m, union instruction op )
 
 static void do_SGE( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = (arg0[0] >= arg1[0]) ? 1.0F : 0.0F;
    result[1] = (arg0[1] >= arg1[1]) ? 1.0F : 0.0F;
@@ -743,9 +592,9 @@ static void do_SGE( struct arb_vp_machine *m, union instruction op )
 
 static void do_SLT( struct arb_vp_machine *m, union instruction op )
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = (arg0[0] < arg1[0]) ? 1.0F : 0.0F;
    result[1] = (arg0[1] < arg1[1]) ? 1.0F : 0.0F;
@@ -753,29 +602,11 @@ static void do_SLT( struct arb_vp_machine *m, union instruction op )
    result[3] = (arg0[3] < arg1[3]) ? 1.0F : 0.0F;
 }
 
-static void do_SWZ( struct arb_vp_machine *m, union instruction op ) 
-{
-   GLfloat *result = m->reg[op.swz.dst];
-   const GLfloat *arg0 = m->reg[op.swz.arg0];
-   GLuint swz = op.swz.swz;
-   GLuint neg = op.swz.neg;
-   GLuint i;
-
-   for (i = 0; i < 4; i++, swz >>= 3, neg >>= 1) {
-      switch (swz & 0x7) {
-      case SWIZZLE_ZERO: result[i] = 0.0; break;
-      case SWIZZLE_ONE:  result[i] = 1.0; break;
-      default:           result[i] = arg0[swz & 0x7]; break;
-      }
-      if (neg & 0x1)     result[i] = -result[i];
-   }
-}
-
 static void do_SUB( struct arb_vp_machine *m, union instruction op ) 
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = arg0[0] - arg1[0];
    result[1] = arg0[1] - arg1[1];
@@ -786,9 +617,9 @@ static void do_SUB( struct arb_vp_machine *m, union instruction op )
 
 static void do_XPD( struct arb_vp_machine *m, union instruction op ) 
 {
-   GLfloat *result = m->reg[op.vec.dst];
-   const GLfloat *arg0 = m->reg[op.vec.arg0];
-   const GLfloat *arg1 = m->reg[op.vec.arg1];
+   GLfloat *result = m->reg[op.alu.dst];
+   const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
+   const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 
    result[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1];
    result[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2];
@@ -801,20 +632,6 @@ static void do_NOP( struct arb_vp_machine *m, union instruction op )
 
 /* Some useful debugging functions:
  */
-static void print_reg( GLuint reg )
-{
-   if (reg == REG_RES) 
-      _mesa_printf("RES");
-   else if (reg >= REG_ARG0 && reg <= REG_ARG2)
-      _mesa_printf("ARG%d", reg - REG_ARG0);
-   else if (reg >= REG_TMP0 && reg < REG_TMP_MAX)
-      _mesa_printf("TMP%d", reg - REG_TMP0);
-   else if (reg >= REG_PAR0 && reg < REG_PAR_MAX)
-      _mesa_printf("PAR%d", reg - REG_PAR0);
-   else
-      _mesa_printf("???");     
-}
-
 static void print_mask( GLuint mask )
 {
    _mesa_printf(".");
@@ -824,45 +641,38 @@ static void print_mask( GLuint mask )
    if (mask&0x8) _mesa_printf("w");
 }
 
-static void print_extern( GLuint file, GLuint idx )
+static void print_reg( GLuint file, GLuint reg )
 {
    static const char *reg_file[] = {
-      "TEMPORARY",
-      "INPUT",
-      "OUTPUT",
+      "REG",
       "LOCAL_PARAM",
       "ENV_PARAM",
-      "NAMED_PARAM",
       "STATE_VAR",
-      "WRITE_ONLY",
-      "ADDRESS"
    };
 
-   _mesa_printf("%s:%d", reg_file[file], idx);
-}
-
-
-
-static void print_SWZ( union instruction op, const struct opcode_info *info )
-{
-   GLuint swz = op.swz.swz;
-   GLuint neg = op.swz.neg;
-   GLuint i;
-
-   _mesa_printf("%s ", info->string);
-   print_reg(op.swz.dst);
-   _mesa_printf(", ");
-   print_reg(op.swz.arg0);
-   _mesa_printf(".");
-   for (i = 0; i < 4; i++, swz >>= 3, neg >>= 1) {
-      const char *cswz = "xyzw01??";
-      if (neg & 0x1)   
-	 _mesa_printf("-");
-      _mesa_printf("%c", cswz[swz&0x7]);
+   if (file == 0) {
+      if (reg == REG_RES) 
+	 _mesa_printf("RES");
+      else if (reg >= REG_ARG0 && reg <= REG_ARG1)
+	 _mesa_printf("ARG%d", reg - REG_ARG0);
+      else if (reg >= REG_TMP0 && reg <= REG_TMP11)
+	 _mesa_printf("TMP%d", reg - REG_TMP0);
+      else if (reg >= REG_IN0 && reg <= REG_IN15)
+	 _mesa_printf("IN%d", reg - REG_IN0);
+      else if (reg >= REG_OUT0 && reg <= REG_OUT14)
+	 _mesa_printf("OUT%d", reg - REG_OUT0);
+      else if (reg == REG_ADDR)
+	 _mesa_printf("ADDR");
+      else if (reg == REG_ID)
+	 _mesa_printf("ID");
+      else
+	 _mesa_printf("REG%d", reg);
    }
-   _mesa_printf("\n");
+   else 
+      _mesa_printf("%s:%d", reg_file[file], reg);
 }
 
+
 static void print_RSW( union instruction op, const struct opcode_info *info )
 {
    GLuint swz = op.rsw.swz;
@@ -870,13 +680,13 @@ static void print_RSW( union instruction op, const struct opcode_info *info )
    GLuint i;
 
    _mesa_printf("%s ", info->string);
-   print_reg(op.rsw.dst);
+   print_reg(0, op.rsw.dst);
    _mesa_printf(", ");
-   print_reg(op.rsw.arg0);
+   print_reg(op.rsw.file0, op.rsw.idx0);
    _mesa_printf(".");
    for (i = 0; i < 4; i++, swz >>= 2) {
       const char *cswz = "xyzw";
-      if (neg)   
+      if (neg & (1<<i))   
 	 _mesa_printf("-");
       _mesa_printf("%c", cswz[swz&0x3]);
    }
@@ -884,193 +694,203 @@ static void print_RSW( union instruction op, const struct opcode_info *info )
 }
 
 
-static void print_SCL( union instruction op, const struct opcode_info *info )
-{
-   _mesa_printf("%s ", info->string);
-   print_reg(op.scl.dst);
-   _mesa_printf(", ");
-   print_reg(op.scl.arg0);
-   if (info->nr_args > 1) {
-      _mesa_printf(", ");
-      print_reg(op.scl.arg1);
-   }
-   _mesa_printf("\n");
-}
-
-
-static void print_VEC( union instruction op, const struct opcode_info *info )
+static void print_ALU( union instruction op, const struct opcode_info *info )
 {
    _mesa_printf("%s ", info->string);
-   print_reg(op.vec.dst);
+   print_reg(0, op.alu.dst);
    _mesa_printf(", ");
-   print_reg(op.vec.arg0);
+   print_reg(op.alu.file0, op.alu.idx0);
    if (info->nr_args > 1) {
       _mesa_printf(", ");
-      print_reg(op.vec.arg1);
-   }
-   if (info->nr_args > 2) {
-      _mesa_printf(", ");
-      print_reg(op.vec.arg2);
+      print_reg(op.alu.file1, op.alu.idx1);
    }
    _mesa_printf("\n");
 }
 
-static void print_MSK( union instruction op, const struct opcode_info *info )
+static void print_SEL( union instruction op, const struct opcode_info *info )
 {
    _mesa_printf("%s ", info->string);
-   print_reg(op.msk.dst);
-   print_mask(op.msk.mask);
+   print_reg(0, op.sel.dst);
    _mesa_printf(", ");
-   print_reg(op.msk.arg0);
-   _mesa_printf("\n");
-}
-
-static void print_IN( union instruction op, const struct opcode_info *info )
-{
-   _mesa_printf("%s ", info->string);
-   print_reg(op.inr.reg);
+   print_reg(0, op.sel.idx0);
+   print_mask(op.sel.mask);
    _mesa_printf(", ");
-   print_extern(op.inr.file, op.inr.idx);
+   print_reg(op.sel.file1, op.sel.idx1);
+   print_mask(~op.sel.mask);
    _mesa_printf("\n");
 }
 
-static void print_OUT( union instruction op, const struct opcode_info *info )
-{
-   _mesa_printf("%s ", info->string);
-   print_extern(op.out.file, op.out.idx);
-   if (op.out.opcode == OUM)
-      print_mask(op.out.mask);
-   _mesa_printf(", ");
-   print_reg(op.out.reg);
-   _mesa_printf("\n");
-}
 
 static void print_NOP( union instruction op, const struct opcode_info *info )
 {
 }
 
 #define NOP 0
-#define VEC 1
-#define SCL 2
-#define SWZ 3
+#define ALU 1
+#define SWZ 2
 
 static const struct opcode_info opcode_info[] = 
 {
-   { VEC, 1, "ABS", do_ABS, print_VEC },
-   { VEC, 2, "ADD", do_ADD, print_VEC },
-   { OUT, 1, "ARL", do_ARL, print_OUT },
-   { SCL, 2, "DP3", do_DP3, print_SCL },
-   { SCL, 2, "DP4", do_DP4, print_SCL },
-   { SCL, 2, "DPH", do_DPH, print_SCL },
-   { VEC, 2, "DST", do_DST, print_VEC },
-   { NOP, 0, "END", do_NOP, print_NOP },
-   { SCL, 1, "EX2", do_EX2, print_VEC },
-   { VEC, 1, "EXP", do_EXP, print_VEC },
-   { VEC, 1, "FLR", do_FLR, print_VEC },
-   { VEC, 1, "FRC", do_FRC, print_VEC },
-   { SCL, 1, "LG2", do_LG2, print_VEC },
-   { VEC, 1, "LIT", do_LIT, print_VEC },
-   { VEC, 1, "LOG", do_LOG, print_VEC },
-   { VEC, 3, "MAD", do_MAD, print_VEC },
-   { VEC, 2, "MAX", do_MAX, print_VEC },
-   { VEC, 2, "MIN", do_MIN, print_VEC },
-   { VEC, 1, "MOV", do_MOV, print_VEC },
-   { VEC, 2, "MUL", do_MUL, print_VEC },
-   { SCL, 2, "POW", do_POW, print_VEC },
-   { VEC, 1, "PRT", do_PRT, print_VEC }, /* PRINT */
-   { NOP, 1, "RCC", do_NOP, print_NOP },
-   { SCL, 1, "RCP", do_RCP, print_VEC },
-   { SCL, 1, "RSQ", do_RSQ, print_VEC },
-   { VEC, 2, "SGE", do_SGE, print_VEC },
-   { VEC, 2, "SLT", do_SLT, print_VEC },
-   { VEC, 2, "SUB", do_SUB, print_VEC },
-   { SWZ, 1, "SWZ", do_SWZ, print_SWZ },
-   { VEC, 2, "XPD", do_XPD, print_VEC },
-   { IN4, 1, "IN1", do_IN1, print_IN }, /* Internals */
-   { IN4, 1, "IN2", do_IN2, print_IN },
-   { IN4, 1, "IN3", do_IN3, print_IN },
-   { IN4, 1, "IN4", do_IN4, print_IN },
-   { OUT, 1, "OUT", do_OUT, print_OUT },
-   { OUT, 1, "OUM", do_OUM, print_OUT },
-   { SWZ, 1, "RSW", do_RSW, print_RSW },
-   { MSK, 1, "MSK", do_MSK, print_MSK },
-   { IN4, 1, "PAR", do_PAR, print_IN },
-   { IN4, 1, "PRL", do_PRL, print_IN },
+   { 1, "ABS", print_ALU },
+   { 2, "ADD", print_ALU },
+   { 1, "ARL", print_ALU },
+   { 2, "DP3", print_ALU },
+   { 2, "DP4", print_ALU },
+   { 2, "DPH", print_ALU },
+   { 2, "DST", print_ALU },
+   { 0, "END", print_NOP },
+   { 1, "EX2", print_ALU },
+   { 1, "EXP", print_ALU },
+   { 1, "FLR", print_ALU },
+   { 1, "FRC", print_ALU },
+   { 1, "LG2", print_ALU },
+   { 1, "LIT", print_ALU },
+   { 1, "LOG", print_ALU },
+   { 3, "MAD", print_NOP },
+   { 2, "MAX", print_ALU },
+   { 2, "MIN", print_ALU },
+   { 1, "MOV", print_ALU },
+   { 2, "MUL", print_ALU },
+   { 2, "POW", print_ALU },
+   { 1, "PRT", print_ALU }, /* PRINT */
+   { 1, "RCC", print_NOP },
+   { 1, "RCP", print_ALU },
+   { 1, "RSQ", print_ALU },
+   { 2, "SGE", print_ALU },
+   { 2, "SLT", print_ALU },
+   { 2, "SUB", print_ALU },
+   { 1, "SWZ", print_NOP },
+   { 2, "XPD", print_ALU },
+   { 1, "RSW", print_RSW },
+   { 2, "SEL", print_SEL },
+   { 1, "REL", print_ALU },
 };
 
 
-static GLuint cvp_load_reg( struct compilation *cp,
-			    GLuint file,
-			    GLuint index,
-			    GLuint rel )
+static void (* const opcode_func[])(struct arb_vp_machine *, union instruction) = 
+{				/* indexed by opcode; same order as opcode_info[] */
+   do_ABS,
+   do_ADD,
+   do_ARL,
+   do_DP3,
+   do_DP4,
+   do_DPH,
+   do_DST,
+   do_NOP,			/* END */
+   do_EX2,
+   do_EXP,
+   do_FLR,
+   do_FRC,
+   do_LG2,
+   do_LIT,
+   do_LOG,
+   do_NOP,			/* MAD: cvp_emit_inst emits MUL + ADD instead */
+   do_MAX,
+   do_MIN,
+   do_MOV,
+   do_MUL,
+   do_POW,
+   do_PRT,
+   do_NOP,			/* RCC */
+   do_RCP,
+   do_RSQ,
+   do_SGE,
+   do_SLT,
+   do_SUB,
+   do_RSW,			/* SWZ slot: cvp_emit_inst lowers SWZ to RSW/MOV/SEL */
+   do_XPD,
+   do_RSW,			/* RSW */
+   do_SEL,
+   do_REL,
+};
+
+static union instruction *cvp_next_instruction( struct compilation *cp )
 {
-   GLuint i, op;
+   union instruction *op = cp->csr++;	/* take the slot at the cursor, then advance it */
+   op->dword = 0;			/* hand the slot back with its dword zeroed */
+   return op;
+}
 
-   if (file == PROGRAM_TEMPORARY)
-      return index + REG_TMP0;
+static struct reg cvp_make_reg( GLuint file, GLuint idx )
+{
+   struct reg reg;
+   reg.file = file;
+   reg.idx = idx;
+   return reg;
+}
 
-   /* Don't try to cache relatively addressed values yet:
-    */
-   if (!rel) {
-      for (i = 0; i < REG_PAR_NR; i++) {
-	 if ((cp->par_active & (1<<i)) &&
-	     cp->reg[i].file == file &&
-	     cp->reg[i].idx == index) {
-	    cp->par_protected |= (1<<i);
-	    return i + REG_PAR0;
-	 }
-      }
-   }
+static struct reg cvp_emit_rel( struct compilation *cp,
+				struct reg reg,		/* operand cvp_load_reg was asked to load with rel != 0 */
+				struct reg tmpreg )	/* register the REL instruction writes to */
+{
+   union instruction *op = cvp_next_instruction(cp);	/* zeroed slot at cp->csr */
+   op->alu.opcode = REL;
+   op->alu.file0 = reg.file;
+   op->alu.idx0 = reg.idx;
+   op->alu.dst = tmpreg.idx;	/* only the index is stored; tmpreg.file is not encoded */
+   return tmpreg;		/* the caller uses tmpreg as the operand from here on */
+}
 
-   /* Not already loaded, so identify a slot and load it.  
-    * TODO: preload these values once only!
-    * TODO: better eviction strategy!
-    */
-   if (cp->par_active == ~0) {
-      assert(cp->par_protected != ~0);
-      cp->par_active = cp->par_protected;
-   }
 
-   i = ffs(~cp->par_active);
-   assert(i);
-   i--;
+static struct reg cvp_load_reg( struct compilation *cp,
+				GLuint file,
+				GLuint index,
+				GLuint rel,
+				GLuint tmpidx )
+{
+   struct reg tmpreg = cvp_make_reg(FILE_REG, tmpidx);
+   struct reg reg;
 
+   switch (file) {
+   case PROGRAM_TEMPORARY:
+      return cvp_make_reg(FILE_REG, REG_TMP0 + index);
 
-   if (file == PROGRAM_INPUT) 
-      op = IN1 + cp->VB->AttribPtr[index]->size - 1;
-   else if (rel)
-      op = PRL;
-   else
-      op = PAR;
-
-   cp->csr->dword = 0;
-   cp->csr->inr.opcode = op;
-   cp->csr->inr.reg = i + REG_PAR0;
-   cp->csr->inr.file = file;
-   cp->csr->inr.idx = index;
-   cp->csr++;
-
-   cp->reg[i].file = file;
-   cp->reg[i].idx = index;
-   cp->par_protected |= (1<<i);
-   cp->par_active |= (1<<i);
-   return i + REG_PAR0;
-}
-
-static void cvp_release_regs( struct compilation *cp )
-{
-   cp->par_protected = 0;
-}
+   case PROGRAM_INPUT:
+      return cvp_make_reg(FILE_REG, REG_IN0 + index);
 
+   case PROGRAM_OUTPUT:
+      return cvp_make_reg(FILE_REG, REG_OUT0 + index);
 
+      /* These two aren't populated by the parser?
+       */
+   case PROGRAM_LOCAL_PARAM: 
+      reg = cvp_make_reg(FILE_LOCAL_PARAM, index);
+      if (rel) 
+	 return cvp_emit_rel(cp, reg, tmpreg);
+      else
+	 return reg;
+
+   case PROGRAM_ENV_PARAM: 
+      reg = cvp_make_reg(FILE_ENV_PARAM, index);
+      if (rel) 
+	 return cvp_emit_rel(cp, reg, tmpreg);
+      else
+	 return reg;
+
+   case PROGRAM_STATE_VAR:
+      reg = cvp_make_reg(FILE_STATE_PARAM, index);
+      if (rel) 
+	 return cvp_emit_rel(cp, reg, tmpreg);
+      else
+	 return reg;
+
+      /* Invalid values:
+       */
+   case PROGRAM_WRITE_ONLY:
+   case PROGRAM_ADDRESS:
+   default:
+      assert(0);
+      return tmpreg;		/* can't happen */
+   }
+}
 
-static GLuint cvp_emit_arg( struct compilation *cp,
-			    const struct vp_src_register *src,
-			    GLuint arg )
+static struct reg cvp_emit_arg( struct compilation *cp,
+				const struct vp_src_register *src,
+				GLuint arg )
 {
-   GLuint reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr );
+   struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg );
    union instruction rsw, noop;
-
+   
    /* Emit any necessary swizzling.  
     */
    rsw.dword = 0;
@@ -1088,12 +908,13 @@ static GLuint cvp_emit_arg( struct compilation *cp,
 		   (3<<6));
 
    if (rsw.dword != noop.dword) {
-      GLuint rsw_reg = arg;
-      cp->csr->dword = rsw.dword;
-      cp->csr->rsw.opcode = RSW;
-      cp->csr->rsw.arg0 = reg;
-      cp->csr->rsw.dst = rsw_reg;
-      cp->csr++;
+      union instruction *op = cvp_next_instruction(cp);
+      struct reg rsw_reg = cvp_make_reg(FILE_REG, REG_ARG0 + arg);
+      op->dword = rsw.dword;
+      op->rsw.opcode = RSW;
+      op->rsw.file0 = reg.file;
+      op->rsw.idx0 = reg.idx;
+      op->rsw.dst = rsw_reg.idx;
       return rsw_reg;
    }
    else
@@ -1102,48 +923,82 @@ static GLuint cvp_emit_arg( struct compilation *cp,
 
 static GLuint cvp_choose_result( struct compilation *cp,
 				 const struct vp_dst_register *dst,
-				 union instruction *fixup,
-				 GLuint maxreg)
+				 union instruction *fixup )
 {
    GLuint mask = dst->WriteMask;
+   GLuint idx;
 
-   if (dst->File == PROGRAM_TEMPORARY) {
-      
-      /* Optimization: When writing (with a writemask) to an undefined
-       * value for the first time, the writemask may be ignored.  In
-       * practise this means that the MSK instruction to implement the
-       * writemask can be dropped.
+   switch (dst->File) {
+   case PROGRAM_TEMPORARY:
+      idx = REG_TMP0 + dst->Index;
+      break;
+   case PROGRAM_OUTPUT:
+      idx = REG_OUT0 + dst->Index;
+      break;
+   default:
+      assert(0);
+      return REG_RES;		/* can't happen */
+   }
+
+   /* Optimization: When writing (with a writemask) to an undefined
+    * value for the first time, the writemask may be ignored. 
+    */
+   if (mask != WRITEMASK_XYZW && (cp->reg_active & (1 << idx))) {
+      fixup->sel.opcode = SEL;
+      fixup->sel.idx0 = REG_RES;
+      fixup->sel.file1 = FILE_REG;
+      fixup->sel.idx1 = idx;
+      fixup->sel.dst = idx;
+      fixup->sel.mask = mask;
+      cp->reg_active |= 1 << idx;
+      return REG_RES;
+   }
+   else {
+      fixup->dword = 0;
+      cp->reg_active |= 1 << idx;
+      return idx;
+   }
+}
+
+#define RSW_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
+
+static struct reg cvp_emit_rsw( struct compilation *cp, 
+				GLuint dst,
+				struct reg src,
+				GLuint neg, 
+				GLuint swz,
+				GLboolean force)
+{
+   struct reg retval;
+
+   if (swz != RSW_NOOP || neg != 0) {
+      union instruction *op = cvp_next_instruction(cp);
+      op->rsw.opcode = RSW;
+      op->rsw.dst = dst;
+      op->rsw.file0 = src.file;
+      op->rsw.idx0 = src.idx;
+      op->rsw.neg = neg;
+      op->rsw.swz = swz;
+	    
+      retval.file = FILE_REG;
+      retval.idx = dst;
+      return retval;
+   }
+   else if (force) {
+      /* Oops.  Degenerate case:
        */
-      if (dst->Index < maxreg &&
-	  (mask == 0xf || !(cp->tmp_active & (1<<dst->Index)))) {
-	 fixup->dword = 0;
-	 cp->tmp_active |= (1<<dst->Index);
-	 return REG_TMP0 + dst->Index;
-      }
-      else if (mask != 0xf) {
-	 fixup->msk.opcode = MSK;
-	 fixup->msk.arg0 = REG_RES;
-	 fixup->msk.dst = REG_TMP0 + dst->Index;
-	 fixup->msk.mask = mask;
-	 cp->tmp_active |= (1<<dst->Index);
-	 return REG_RES;
-      }
-      else {
-	 fixup->vec.opcode = VP_OPCODE_MOV;
-	 fixup->vec.arg0 = REG_RES;
-	 fixup->vec.dst = REG_TMP0 + dst->Index;
-	 cp->tmp_active |= (1<<dst->Index);
-	 return REG_RES;
-      }
+      union instruction *op = cvp_next_instruction(cp);
+      op->alu.opcode = VP_OPCODE_MOV;
+      op->alu.dst = dst;
+      op->alu.file0 = src.file;
+      op->alu.idx0 = src.idx;
+      
+      retval.file = FILE_REG;
+      retval.idx = dst;
+      return retval;
    }
    else {
-      assert(dst->File == PROGRAM_OUTPUT);
-      fixup->out.opcode = (mask == 0xf) ? OUT : OUM;
-      fixup->out.reg = REG_RES;
-      fixup->out.file = dst->File;
-      fixup->out.idx = dst->Index;
-      fixup->out.mask = mask;
-      return REG_RES;
+      return src;
    }
 }
 
@@ -1152,76 +1007,129 @@ static void cvp_emit_inst( struct compilation *cp,
 			   const struct vp_instruction *inst )
 {
    const struct opcode_info *info = &opcode_info[inst->Opcode];
+   union instruction *op;
    union instruction fixup;
-   GLuint reg[3];
+   struct reg reg[3];
    GLuint result, i;
 
    /* Need to handle SWZ, ARL specially.
     */
-   switch (info->type) {
-   case OUT:
-      assert(inst->Opcode == VP_OPCODE_ARL);
-      reg[0] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
-
-      cp->csr->dword = 0;
-      cp->csr->out.opcode = inst->Opcode;
-      cp->csr->out.reg = reg[0];
-      cp->csr->out.file = PROGRAM_ADDRESS;
-      cp->csr->out.idx = 0;
+   switch (inst->Opcode) {
+   case VP_OPCODE_MAD:		/* Split into mul and add: */
+      result = cvp_choose_result( cp, &inst->DstReg, &fixup );
+      for (i = 0; i < 3; i++)
+	 reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0+i );
+      op = cvp_next_instruction(cp);	/* REG_ARG0 = src0 * src1 */
+      op->alu.opcode = VP_OPCODE_MUL;
+      op->alu.file0 = reg[0].file;
+      op->alu.idx0 = reg[0].idx;
+      op->alu.file1 = reg[1].file;
+      op->alu.idx1 = reg[1].idx;
+      op->alu.dst = REG_ARG0;
+      op = cvp_next_instruction(cp);	/* result = REG_ARG0 + src2 */
+      op->alu.opcode = VP_OPCODE_ADD;
+      op->alu.file0 = FILE_REG;
+      op->alu.idx0 = REG_ARG0;
+      op->alu.file1 = reg[2].file;
+      op->alu.idx1 = reg[2].idx;
+      op->alu.dst = result;
+      if (result == REG_RES) {		/* masked write: emit the SEL fixup, as the default case does */
+	 op = cvp_next_instruction(cp);
+	 op->dword = fixup.dword;
+      }
       break;
-   case SWZ:
-      assert(inst->Opcode == VP_OPCODE_SWZ);
-      result = cvp_choose_result( cp, &inst->DstReg, &fixup, REG_SWZDST_MAX );
 
+   case VP_OPCODE_ARL:
       reg[0] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
 
-      cp->csr->dword = 0;
-      cp->csr->swz.opcode = VP_OPCODE_SWZ;
-      cp->csr->swz.arg0 = reg[0];
-      cp->csr->swz.dst = result;
-      cp->csr->swz.neg = inst->SrcReg[0].Negate;
-      cp->csr->swz.swz = inst->SrcReg[0].Swizzle;
-      cp->csr++;
-
-      if (result == REG_RES) {
-	 cp->csr->dword = fixup.dword;
-	 cp->csr++;
-      }
+      op = cvp_next_instruction(cp);
+      op->alu.opcode = inst->Opcode;
+      op->alu.dst = REG_ADDR;
+      op->alu.file0 = reg[0].file;
+      op->alu.idx0 = reg[0].idx;
       break;
 
-   case VEC:
-   case SCL:			/* for now */
-      result = cvp_choose_result( cp, &inst->DstReg, &fixup, REG_MAX );
+   case VP_OPCODE_SWZ: {
+      GLuint swz0 = 0, swz1 = 0;	/* built up with |= in the loop below, so must start at zero */
+      GLuint neg0 = 0, neg1 = 0;
+      GLuint mask = 0;
 
-      reg[0] = reg[1] = reg[2] = 0;
+      /* Translate 3-bit-per-element swizzle into two 2-bit swizzles,
+       * one from the source register the other from a constant
+       * {0,0,0,1}.
+       */
+      for (i = 0; i < 4; i++) {
+	 GLuint swzelt = GET_SWZ(inst->SrcReg[0].Swizzle, i);
+	 if (swzelt >= SWIZZLE_ZERO) {
+	    neg0 |= inst->SrcReg[0].Negate & (1<<i);
+	    if (swzelt == SWIZZLE_ONE)
+	       swz0 |= SWIZZLE_W << (i*2);
+	    else if (i < SWIZZLE_W)
+	       swz0 |= i << (i*2);
+	 }
+	 else {
+	    mask |= 1<<i;
+	    neg1 |= inst->SrcReg[0].Negate & (1<<i);
+	    swz1 |= swzelt << (i*2);
+	 }
+      }
 
-      for (i = 0; i < info->nr_args; i++)
-	 reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0 + i );
+      result = cvp_choose_result( cp, &inst->DstReg, &fixup );
+      reg[0].file = FILE_REG;
+      reg[0].idx = REG_ID;
+      reg[1] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
 
-      cp->csr->dword = 0;
-      cp->csr->vec.opcode = inst->Opcode;
-      cp->csr->vec.arg0 = reg[0];
-      cp->csr->vec.arg1 = reg[1];
-      cp->csr->vec.arg2 = reg[2];
-      cp->csr->vec.dst = result;
-      cp->csr++;
+      if (mask == WRITEMASK_XYZW) {
+	 cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
+	 
+      }
+      else if (mask == 0) {
+	 cvp_emit_rsw(cp, result, reg[1], neg1, swz1, GL_TRUE);
+      }
+      else {
+	 reg[0] = cvp_emit_rsw(cp, REG_ARG0, reg[0], neg0, swz0, GL_FALSE);
+	 reg[1] = cvp_emit_rsw(cp, REG_ARG1, reg[1], neg1, swz1, GL_FALSE);
+
+	 assert(reg[0].file == FILE_REG);
+
+	 op = cvp_next_instruction(cp);
+	 op->sel.opcode = SEL;
+	 op->sel.dst = result;
+	 op->sel.idx0 = reg[0].idx;
+	 op->sel.file1 = reg[1].file;
+	 op->sel.idx1 = reg[1].idx;
+	 op->sel.mask = mask;
+      }
 
       if (result == REG_RES) {
-	 cp->csr->dword = fixup.dword;
-	 cp->csr++;
-      }      	 
+	 op = cvp_next_instruction(cp);
+	 op->dword = fixup.dword;
+      }
       break;
-
-
-   case NOP:
+   }
+   case VP_OPCODE_PRINT:
+   case VP_OPCODE_END:
       break;
 
    default:
-      assert(0);
+      result = cvp_choose_result( cp, &inst->DstReg, &fixup );
+      for (i = 0; i < info->nr_args; i++) 
+	 reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0 + i );
+
+      op = cvp_next_instruction(cp);
+      op->alu.opcode = inst->Opcode;
+      op->alu.file0 = reg[0].file;
+      op->alu.idx0 = reg[0].idx;
+      op->alu.file1 = reg[1].file;
+      op->alu.idx1 = reg[1].idx;
+      op->alu.dst = result;
+
+      if (result == REG_RES) {
+	 op = cvp_next_instruction(cp);
+	 op->dword = fixup.dword;
+      }      	 
       break;
    }
-
-   cvp_release_regs( cp );
 }
 
 
@@ -1254,7 +1162,7 @@ static void compile_vertex_program( struct arb_vp_machine *m,
    if (DISASSEM) {
       for (i = 0; i < m->nr_instructions; i++) {
 	 union instruction insn = m->instructions[i];
-	 const struct opcode_info *info = &opcode_info[insn.vec.opcode];
+	 const struct opcode_info *info = &opcode_info[insn.alu.opcode];
 	 info->print( insn, info );
       }
       _mesa_printf("\n\n");
@@ -1390,15 +1298,62 @@ run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
 
    if (program->Parameters) {
       _mesa_load_state_parameters(ctx, program->Parameters);
-      m->File[PROGRAM_STATE_VAR] = program->Parameters->ParameterValues;
    }   
+   
+
+   /* Initialize regs where necessary:
+    */
+   ASSIGN_4V(m->reg[REG_ID], 0, 0, 0, 1);
+
+   m->nr_inputs = m->nr_outputs = 0;
+
+   for (i = 0; i < 16; i++) {
+      if (program->InputsRead & (1<<i)) {
+	 GLuint j = m->nr_inputs++;
+	 m->input[j].idx = i;
+	 m->input[j].data = m->VB->AttribPtr[i]->data;
+	 m->input[j].stride = m->VB->AttribPtr[i]->stride;
+	 m->input[j].size = m->VB->AttribPtr[i]->size;
+	 ASSIGN_4V(m->reg[REG_IN0 + i], 0, 0, 0, 1);
+      }
+   }     
+
+   for (i = 0; i < 15; i++) {
+      if (program->OutputsWritten & (1<<i)) {
+	 GLuint j = m->nr_outputs++;
+	 m->output[j].idx = i;
+	 m->output[j].data = m->attribs[i].data;
+      }
+   }     
+
 
    /* Run the actual program:
     */
    for (m->vtx_nr = 0; m->vtx_nr < VB->Count; m->vtx_nr++) {
+      for (j = 0; j < m->nr_inputs; j++) {
+	 GLuint idx = REG_IN0 + m->input[j].idx;
+	 switch (m->input[j].size) {
+	 case 4: m->reg[idx][3] = m->input[j].data[3];
+	 case 3: m->reg[idx][2] = m->input[j].data[2];
+	 case 2: m->reg[idx][1] = m->input[j].data[1];
+	 case 1: m->reg[idx][0] = m->input[j].data[0];
+	 }
+
+	 STRIDE_F(m->input[j].data, m->input[j].stride);
+      }
+
       for (j = 0; j < m->nr_instructions; j++) {
 	 union instruction inst = m->instructions[j];	 
-	 opcode_info[inst.vec.opcode].func( m, inst );
+	 opcode_func[inst.alu.opcode]( m, inst );
+      }
+
+      for (j = 0; j < m->nr_outputs; j++) {
+	 GLuint idx = REG_OUT0 + m->output[j].idx;
+	 m->output[j].data[0] = m->reg[idx][0];
+	 m->output[j].data[1] = m->reg[idx][1];
+	 m->output[j].data[2] = m->reg[idx][2];
+	 m->output[j].data[3] = m->reg[idx][3];
+	 m->output[j].data += 4;
       }
    }
 
@@ -1488,9 +1443,10 @@ validate_vertex_program( GLcontext *ctx, struct tnl_pipeline_stage *stage )
       
       /* Grab the state GL state and put into registers:
        */
-      m->File[PROGRAM_LOCAL_PARAM] = program->Base.LocalParams;
-      m->File[PROGRAM_ENV_PARAM] = ctx->VertexProgram.Parameters;
-      m->File[PROGRAM_STATE_VAR] = 0;
+      m->File[FILE_REG] = m->reg;
+      m->File[FILE_LOCAL_PARAM] = program->Base.LocalParams;
+      m->File[FILE_ENV_PARAM] = ctx->VertexProgram.Parameters;
+      m->File[FILE_STATE_PARAM] = program->Parameters->ParameterValues;
    }
 }