r300: rewrite and hopefully simplify RS setup
[mesa.git] / src / mesa / drivers / dri / r300 / r300_state.c
index e82c3d9681c53bdccc94a55ec43d506a7d6962af..171a1549188b9b240b37069a08ef1b1fe383cd10 100644 (file)
@@ -35,23 +35,23 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \author Nicolai Haehnle <prefect_@gmx.net>
  */
 
-#include "glheader.h"
-#include "state.h"
-#include "imports.h"
-#include "enums.h"
-#include "macros.h"
-#include "context.h"
-#include "dd.h"
-#include "simple_list.h"
-
-#include "api_arrayelt.h"
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/dd.h"
+#include "main/simple_list.h"
+#include "main/api_arrayelt.h"
+#include "main/texformat.h"
+
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
 #include "shader/prog_parameter.h"
 #include "shader/prog_statevars.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
-#include "texformat.h"
 
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
@@ -70,20 +70,28 @@ extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
 
 static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
 {
-       GLubyte color[4];
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
 
        R300_STATECHANGE(rmesa, blend_color);
 
-       CLAMPED_FLOAT_TO_UBYTE(color[0], cf[0]);
-       CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]);
-       CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]);
-       CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]);
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+               GLuint r = IROUND(cf[0]*1023.0f);
+               GLuint g = IROUND(cf[1]*1023.0f);
+               GLuint b = IROUND(cf[2]*1023.0f);
+               GLuint a = IROUND(cf[3]*1023.0f);
+
+               rmesa->hw.blend_color.cmd[1] = r | (a << 16);
+               rmesa->hw.blend_color.cmd[2] = b | (g << 16);
+       } else {
+               GLubyte color[4];
+               CLAMPED_FLOAT_TO_UBYTE(color[0], cf[0]);
+               CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]);
+               CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]);
+               CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]);
 
-       rmesa->hw.blend_color.cmd[1] = PACK_COLOR_8888(color[3], color[0],
-                                                      color[1], color[2]);
-       rmesa->hw.blend_color.cmd[2] = 0;
-       rmesa->hw.blend_color.cmd[3] = 0;
+               rmesa->hw.blend_color.cmd[1] = PACK_COLOR_8888(color[3], color[0],
+                                                       color[1], color[2]);
+       }
 }
 
 /**
@@ -313,6 +321,44 @@ static void r300BlendFuncSeparate(GLcontext * ctx,
        r300SetBlendState(ctx);
 }
 
+/**
+ * Translate LogicOp enums into hardware representation.
+ * Both use a very logical bit-wise layout, but unfortunately the order
+ * of bits is reversed.
+ */
+static GLuint translate_logicop(GLenum logicop)
+{
+       GLuint bits = logicop - GL_CLEAR;
+       bits = ((bits & 1) << 3) | ((bits & 2) << 1) | ((bits & 4) >> 1) | ((bits & 8) >> 3);
+       return bits << R300_RB3D_ROPCNTL_ROP_SHIFT;
+}
+
+/**
+ * Used internally to update the r300->hw hardware state to match the
+ * current OpenGL state.
+ */
+static void r300SetLogicOpState(GLcontext *ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       R300_STATECHANGE(r300, rop);
+       if (RGBA_LOGICOP_ENABLED(ctx)) {
+               r300->hw.rop.cmd[1] = R300_RB3D_ROPCNTL_ROP_ENABLE |
+                       translate_logicop(ctx->Color.LogicOp);
+       } else {
+               r300->hw.rop.cmd[1] = 0;
+       }
+}
+
+/**
+ * Called by Mesa when an application program changes the LogicOp state
+ * via glLogicOp.
+ */
+static void r300LogicOpcode(GLcontext *ctx, GLenum logicop)
+{
+       if (RGBA_LOGICOP_ENABLED(ctx))
+               r300SetLogicOpState(ctx);
+}
+
 static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
@@ -701,8 +747,6 @@ static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
 
        switch (pname) {
        case GL_FOG_MODE:
-               if (!ctx->Fog.Enabled)
-                       return;
                switch (ctx->Fog.Mode) {
                case GL_LINEAR:
                        R300_STATECHANGE(r300, fogs);
@@ -739,6 +783,7 @@ static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
                            R300_FG_FOG_BLEND_FN_EXP2;
                        fogScale.f = 0.3 * ctx->Fog.Density;
                        fogStart.f = 0.0;
+                        break;
                default:
                        return;
                }
@@ -827,6 +872,31 @@ static void r300PointSize(GLcontext * ctx, GLfloat size)
            ((int)(size * 6) << R300_POINTSIZE_Y_SHIFT);
 }
 
+static void r300PointParameter(GLcontext * ctx, GLenum pname, const GLfloat * param)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       switch (pname) {
+       case GL_POINT_SIZE_MIN:
+               R300_STATECHANGE(r300, ga_point_minmax);
+               r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MIN_MASK;
+               r300->hw.ga_point_minmax.cmd[1] |= (GLuint)(ctx->Point.MinSize * 6.0);
+               break;
+       case GL_POINT_SIZE_MAX:
+               R300_STATECHANGE(r300, ga_point_minmax);
+               r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MAX_MASK;
+               r300->hw.ga_point_minmax.cmd[1] |= (GLuint)(ctx->Point.MaxSize * 6.0)
+                       << R300_GA_POINT_MINMAX_MAX_SHIFT;
+               break;
+       case GL_POINT_DISTANCE_ATTENUATION:
+               break;
+       case GL_POINT_FADE_THRESHOLD_SIZE:
+               break;
+       default:
+               break;
+       }
+}
+
 /* =============================================================
  * Line state
  */
@@ -905,15 +975,9 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        GLuint refmask =
-           (((ctx->Stencil.
-              Ref[0] & 0xff) << R300_STENCILREF_SHIFT) | ((ctx->
-                                                           Stencil.
-                                                           ValueMask
-                                                           [0] &
-                                                           0xff)
-                                                          <<
-                                                          R300_STENCILMASK_SHIFT));
-
+           ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
+            | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
+       const unsigned back = ctx->Stencil._BackFace;
        GLuint flag;
 
        R300_STATECHANGE(rmesa, zs);
@@ -931,8 +995,7 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
            (flag << R300_S_FRONT_FUNC_SHIFT);
 
-       if (ctx->Stencil._TestTwoSide)
-               flag = translate_func(ctx->Stencil.Function[1]);
+       flag = translate_func(ctx->Stencil.Function[back]);
 
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
            (flag << R300_S_BACK_FUNC_SHIFT);
@@ -957,6 +1020,7 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
                                  GLenum fail, GLenum zfail, GLenum zpass)
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       const unsigned back = ctx->Stencil._BackFace;
 
        R300_STATECHANGE(rmesa, zs);
        /* It is easier to mask what's left.. */
@@ -973,23 +1037,13 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
            | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
               R300_S_FRONT_ZPASS_OP_SHIFT);
 
-       if (ctx->Stencil._TestTwoSide) {
-               rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
-                   (translate_stencil_op(ctx->Stencil.FailFunc[1]) <<
-                    R300_S_BACK_SFAIL_OP_SHIFT)
-                   | (translate_stencil_op(ctx->Stencil.ZFailFunc[1]) <<
-                      R300_S_BACK_ZFAIL_OP_SHIFT)
-                   | (translate_stencil_op(ctx->Stencil.ZPassFunc[1]) <<
-                      R300_S_BACK_ZPASS_OP_SHIFT);
-       } else {
-               rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
-                   (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
-                    R300_S_BACK_SFAIL_OP_SHIFT)
-                   | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
-                      R300_S_BACK_ZFAIL_OP_SHIFT)
-                   | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
-                      R300_S_BACK_ZPASS_OP_SHIFT);
-       }
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+           (translate_stencil_op(ctx->Stencil.FailFunc[back]) <<
+            R300_S_BACK_SFAIL_OP_SHIFT)
+           | (translate_stencil_op(ctx->Stencil.ZFailFunc[back]) <<
+              R300_S_BACK_ZFAIL_OP_SHIFT)
+           | (translate_stencil_op(ctx->Stencil.ZPassFunc[back]) <<
+              R300_S_BACK_ZPASS_OP_SHIFT);
 }
 
 /* =============================================================
@@ -1145,7 +1199,7 @@ r300FetchStateParameter(GLcontext * ctx,
 
                case STATE_R300_TEXRECT_FACTOR:{
                                struct gl_texture_object *t =
-                                   ctx->Texture.Unit[state[2]].CurrentRect;
+                                   ctx->Texture.Unit[state[2]].CurrentTex[TEXTURE_RECT_INDEX];
 
                                if (t && t->Image[0][t->BaseLevel]) {
                                        struct gl_texture_image *image =
@@ -1256,8 +1310,8 @@ static unsigned long gen_fixed_filter(unsigned long f)
            (R300_TX_CLAMP << R300_TX_WRAP_T_SHIFT)) {
                needs_fixing |= 2;
        }
-       if ((f & ((7 - 1) << R300_TX_WRAP_Q_SHIFT)) ==
-           (R300_TX_CLAMP << R300_TX_WRAP_Q_SHIFT)) {
+       if ((f & ((7 - 1) << R300_TX_WRAP_R_SHIFT)) ==
+           (R300_TX_CLAMP << R300_TX_WRAP_R_SHIFT)) {
                needs_fixing |= 4;
        }
 
@@ -1297,8 +1351,8 @@ static unsigned long gen_fixed_filter(unsigned long f)
                f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_T_SHIFT;
        }
        if (needs_fixing & 4) {
-               f &= ~((7 - 1) << R300_TX_WRAP_Q_SHIFT);
-               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_Q_SHIFT;
+               f &= ~((7 - 1) << R300_TX_WRAP_R_SHIFT);
+               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_R_SHIFT;
        }
        return f;
 }
@@ -1351,14 +1405,15 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
        int i;
        struct r500_fragment_program *fp = (struct r500_fragment_program *)
            (char *)ctx->FragmentProgram._Current;
+       struct r500_fragment_program_code *code = &fp->code;
 
        /* find all the texture instructions and relocate the texture units */
-       for (i = 0; i < fp->inst_end + 1; i++) {
-               if ((fp->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
+       for (i = 0; i < code->inst_end + 1; i++) {
+               if ((code->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
                        uint32_t val;
                        int unit, opcode, new_unit;
 
-                       val = fp->inst[i].inst1;
+                       val = code->inst[i].inst1;
 
                        unit = (val >> 16) & 0xf;
 
@@ -1375,11 +1430,21 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
                                }
                        }
                        val |= R500_TEX_ID(new_unit);
-                       fp->inst[i].inst1 = val;
+                       code->inst[i].inst1 = val;
                }
        }
 }
 
+static GLuint translate_lod_bias(GLfloat bias)
+{
+       GLint b = (int)(bias*32);
+       if (b >= (1 << 9))
+               b = (1 << 9)-1;
+       else if (b < -(1 << 9))
+               b = -(1 << 9);
+       return (((GLuint)b) << R300_LOD_BIAS_SHIFT) & R300_LOD_BIAS_MASK;
+}
+
 static void r300SetupTextures(GLcontext * ctx)
 {
        int i, mtu;
@@ -1443,7 +1508,14 @@ static void r300SetupTextures(GLcontext * ctx)
                        r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
                                                hw_tmu] =
                            gen_fixed_filter(t->filter) | (hw_tmu << 28);
-                       r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = t->filter_1;
+                       /* Note: There is a LOD bias per texture unit and a LOD bias
+                        * per texture object. We add them here to get the correct behaviour.
+                        * (The per-texture object LOD bias was introduced in OpenGL 1.4
+                        * and is not present in the EXT_texture_object extension).
+                        */
+                       r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+                               t->filter_1 |
+                               translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias);
                        r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
                            t->size;
                        r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
@@ -1523,18 +1595,14 @@ union r300_outputs_written {
 static void r300SetupRSUnit(GLcontext * ctx)
 {
        r300ContextPtr r300 = R300_CONTEXT(ctx);
-       /* I'm still unsure if these are needed */
-       GLuint interp_col[8];
         TNLcontext *tnl = TNL_CONTEXT(ctx);
        struct vertex_buffer *VB = &tnl->vb;
        union r300_outputs_written OutputsWritten;
        GLuint InputsRead;
        int fp_reg, high_rr;
-       int col_interp_nr;
-       int rs_tex_count = 0, rs_col_count = 0;
-       int i, count;
-
-       memset(interp_col, 0, sizeof(interp_col));
+       int col_ip, tex_ip;
+       int rs_tex_count = 0;
+       int i, count, col_fmt;
 
        if (hw_tcl_on)
                OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
@@ -1552,44 +1620,66 @@ static void r300SetupRSUnit(GLcontext * ctx)
        R300_STATECHANGE(r300, rc);
        R300_STATECHANGE(r300, rr);
 
-       fp_reg = col_interp_nr = high_rr = 0;
-
-       r300->hw.rr.cmd[R300_RR_INST_1] = 0;
+       fp_reg = col_ip = tex_ip = col_fmt = 0;
 
-       if (InputsRead & FRAG_BIT_WPOS) {
-               for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
-                       if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
-                               break;
+       r300->hw.rc.cmd[1] = 0;
+       r300->hw.rc.cmd[2] = 0;
+       for (i=0; i<R300_RR_CMDSIZE-1; ++i)
+               r300->hw.rr.cmd[R300_RR_INST_0 + i] = 0;
 
-               if (i == ctx->Const.MaxTextureUnits) {
-                       fprintf(stderr, "\tno free texcoord found...\n");
-                       _mesa_exit(-1);
-               }
+       for (i=0; i<R300_RI_CMDSIZE-1; ++i)
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0;
 
-               InputsRead |= (FRAG_BIT_TEX0 << i);
-               InputsRead &= ~FRAG_BIT_WPOS;
-       }
 
        if (InputsRead & FRAG_BIT_COL0) {
-               count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-               interp_col[0] |= R300_RS_COL_PTR(rs_col_count);
-               if (count == 3)
-                       interp_col[0] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGB1);
-               rs_col_count += count;
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL0;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+               }
        }
-       else
-               interp_col[0] = R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
 
        if (InputsRead & FRAG_BIT_COL1) {
-               count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-               if (count == 3)
-                       interp_col[1] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGB0);
-               interp_col[1] |= R300_RS_COL_PTR(1);
-               rs_col_count += count;
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL1;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+               }
        }
 
-
        for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+               if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
+                   continue;
+
+               if (!R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+                   WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+                   continue;
+               }
+
                int swiz;
 
                /* with TCL we always seem to route 4 components */
@@ -1598,7 +1688,6 @@ static void r300SetupRSUnit(GLcontext * ctx)
                else
                  count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
 
-               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = interp_col[i] | rs_tex_count;
                switch(count) {
                case 4: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3); break;
                case 3: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
@@ -1607,63 +1696,48 @@ static void r300SetupRSUnit(GLcontext * ctx)
                case 2: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
                };
 
-               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] |= swiz;
-
-               r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] = 0;
-               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-
-                       rs_tex_count += count;
-
-                       //assert(r300->state.texture.tc_count != 0);
-                       r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] |= R300_RS_INST_TEX_CN_WRITE | i       /* source INTERP */
-                           | (fp_reg << R300_RS_INST_TEX_ADDR_SHIFT);
-                       high_rr = fp_reg;
-
-                       /* Passing invalid data here can lock the GPU. */
-                       if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
-                               InputsRead &= ~(FRAG_BIT_TEX0 << i);
-                               fp_reg++;
-                       } else {
-                               WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
-                       }
-               }
-       }
-
-       if (InputsRead & FRAG_BIT_COL0) {
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-                       r300->hw.rr.cmd[R300_RR_INST_0] |= R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
-                       InputsRead &= ~FRAG_BIT_COL0;
-                       col_interp_nr++;
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz | R300_RS_TEX_PTR(rs_tex_count);
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~(FRAG_BIT_TEX0 << i);
+               rs_tex_count += count;
+               ++tex_ip;
+               ++fp_reg;
+       }
+
+       if (InputsRead & FRAG_BIT_FOGC) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_FOGC;
+                       rs_tex_count += 4;
+                       ++tex_ip;
+                       ++fp_reg;
                } else {
-                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+                       WARN_ONCE("fragprog wants fogc, vp doesn't provide it\n");
                }
        }
 
-       if (InputsRead & FRAG_BIT_COL1) {
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-                       r300->hw.rr.cmd[R300_RR_INST_1] |= R300_RS_INST_COL_ID(1) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
-                       InputsRead &= ~FRAG_BIT_COL1;
-                       if (high_rr < 1)
-                               high_rr = 1;
-                       col_interp_nr++;
-               } else {
-                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
-               }
+       if (InputsRead & FRAG_BIT_WPOS) {
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~FRAG_BIT_WPOS;
+               rs_tex_count += 4;
+               ++tex_ip;
+               ++fp_reg;
        }
+       InputsRead &= ~FRAG_BIT_WPOS;
 
-       /* Need at least one. This might still lock as the values are undefined... */
-       if (rs_tex_count == 0 && col_interp_nr == 0) {
-               r300->hw.rr.cmd[R300_RR_INST_0] |= R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
-               col_interp_nr++;
+       /* Setup default color if no color or tex was set */
+       if (rs_tex_count == 0 && col_ip == 0) {
+               r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+               ++col_ip;
        }
 
-       r300->hw.rc.cmd[1] = 0 | (rs_tex_count << R300_IT_COUNT_SHIFT)
-         | (col_interp_nr << R300_IC_COUNT_SHIFT)
-         | R300_HIRES_EN;
+       high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
+       r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+       r300->hw.rc.cmd[2] |= high_rr - 1;
 
-       assert(high_rr >= 0);
-       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr + 1);
-       r300->hw.rc.cmd[2] = high_rr;
+       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr);
 
        if (InputsRead)
                WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
@@ -1672,18 +1746,15 @@ static void r300SetupRSUnit(GLcontext * ctx)
 static void r500SetupRSUnit(GLcontext * ctx)
 {
        r300ContextPtr r300 = R300_CONTEXT(ctx);
-       /* I'm still unsure if these are needed */
-       GLuint interp_col[8];
-       union r300_outputs_written OutputsWritten;
         TNLcontext *tnl = TNL_CONTEXT(ctx);
        struct vertex_buffer *VB = &tnl->vb;
+       union r300_outputs_written OutputsWritten;
        GLuint InputsRead;
        int fp_reg, high_rr;
-       int rs_col_count = 0;
-       int in_texcoords, col_interp_nr;
-       int i, count;
+       int col_ip, tex_ip;
+       int rs_tex_count = 0;
+       int i, count, col_fmt;
 
-       memset(interp_col, 0, sizeof(interp_col));
        if (hw_tcl_on)
                OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
        else
@@ -1700,130 +1771,151 @@ static void r500SetupRSUnit(GLcontext * ctx)
        R300_STATECHANGE(r300, rc);
        R300_STATECHANGE(r300, rr);
 
-       fp_reg = col_interp_nr = high_rr = in_texcoords = 0;
+       fp_reg = col_ip = tex_ip = col_fmt = 0;
 
-       r300->hw.rr.cmd[R300_RR_INST_1] = 0;
-
-       if (InputsRead & FRAG_BIT_WPOS) {
-               for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
-                       if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
-                               break;
+       r300->hw.rc.cmd[1] = 0;
+       r300->hw.rc.cmd[2] = 0;
+       for (i=0; i<R300_RR_CMDSIZE-1; ++i)
+               r300->hw.rr.cmd[R300_RR_INST_0 + i] = 0;
 
-               if (i == ctx->Const.MaxTextureUnits) {
-                       fprintf(stderr, "\tno free texcoord found...\n");
-                       _mesa_exit(-1);
-               }
+       for (i=0; i<R500_RI_CMDSIZE-1; ++i)
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0;
 
-               InputsRead |= (FRAG_BIT_TEX0 << i);
-               InputsRead &= ~FRAG_BIT_WPOS;
-       }
 
        if (InputsRead & FRAG_BIT_COL0) {
-               count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-               interp_col[0] |= R500_RS_COL_PTR(rs_col_count);
-               if (count == 3)
-                       interp_col[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGB1);
-               rs_col_count += count;
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL0;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+               }
        }
-       else
-               interp_col[0] = R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
 
        if (InputsRead & FRAG_BIT_COL1) {
-               count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-               interp_col[1] |= R500_RS_COL_PTR(1);
-               if (count == 3)
-                       interp_col[1] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGB0);
-               rs_col_count += count;
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL1;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+               }
        }
 
+
        for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-               GLuint swiz = 0;
+               if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
+                   continue;
 
-               /* with TCL we always seem to route 4 components */
-               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-
-                 if (hw_tcl_on)
-                   count = 4;
-                 else
-                   count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-
-                 /* always have on texcoord */
-                 swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_S_SHIFT;
-                 if (count >= 2)
-                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_T_SHIFT;
-                 else
-                   swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-
-                 if (count >= 3)
-                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_R_SHIFT;
-                 else
-                   swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-
-                 if (count == 4)
-                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_Q_SHIFT;
-                 else
-                   swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-
-               } else
-                  swiz = (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-                         (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-                         (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-                         (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
-
-               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = interp_col[i] | swiz;
-
-               r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] = 0;
-               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-                       //assert(r300->state.texture.tc_count != 0);
-                       r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] |= R500_RS_INST_TEX_CN_WRITE | i       /* source INTERP */
-                           | (fp_reg << R500_RS_INST_TEX_ADDR_SHIFT);
-                       high_rr = fp_reg;
-
-                       /* Passing invalid data here can lock the GPU. */
-                       if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
-                               InputsRead &= ~(FRAG_BIT_TEX0 << i);
-                               fp_reg++;
-                       } else {
-                               WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
-                       }
+               if (!R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+                   WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+                   continue;
                }
-       }
 
-       if (InputsRead & FRAG_BIT_COL0) {
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-                       r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_CN_WRITE | (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
-                       InputsRead &= ~FRAG_BIT_COL0;
-                       col_interp_nr++;
+               int swiz = 0;
+
+               /* with TCL we always seem to route 4 components */
+               if (hw_tcl_on)
+                 count = 4;
+               else
+                 count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
+
+               if (count == 4) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= (rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 3) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 2) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 1) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
                } else {
-                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
                }
-       }
 
-       if (InputsRead & FRAG_BIT_COL1) {
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-                       r300->hw.rr.cmd[R300_RR_INST_1] |= (1 << 12) | R500_RS_INST_COL_CN_WRITE |  (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
-                       InputsRead &= ~FRAG_BIT_COL1;
-                       if (high_rr < 1)
-                               high_rr = 1;
-                       col_interp_nr++;
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz;
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~(FRAG_BIT_TEX0 << i);
+               rs_tex_count += count;
+               ++tex_ip;
+               ++fp_reg;
+       }
+
+       if (InputsRead & FRAG_BIT_FOGC) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                               ((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                               ((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                               ((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+                       r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_FOGC;
+                       rs_tex_count += 4;
+                       ++tex_ip;
+                       ++fp_reg;
                } else {
-                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+                       WARN_ONCE("fragprog wants fogc, vp doesn't provide it\n");
                }
        }
 
-       /* Need at least one. This might still lock as the values are undefined... */
-       if (in_texcoords == 0 && col_interp_nr == 0) {
-               r300->hw.rr.cmd[R300_RR_INST_0] |= 0 | R500_RS_INST_COL_CN_WRITE | (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
-               col_interp_nr++;
+       if (InputsRead & FRAG_BIT_WPOS) {
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                               ((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                               ((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                               ((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~FRAG_BIT_WPOS;
+               rs_tex_count += 4;
+               ++tex_ip;
+               ++fp_reg;
+       }
+
+       /* Setup default color if no color or tex was set */
+       if (rs_tex_count == 0 && col_ip == 0) {
+               r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+               ++col_ip;
        }
 
-       r300->hw.rc.cmd[1] = 0 | (in_texcoords << R300_IT_COUNT_SHIFT)
-         | (col_interp_nr << R300_IC_COUNT_SHIFT)
-         | R300_HIRES_EN;
+       high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
+       r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+       r300->hw.rc.cmd[2] |= 0xC0 | (high_rr - 1);
 
-       assert(high_rr >= 0);
-       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr + 1);
-       r300->hw.rc.cmd[2] = 0xC0 | high_rr;
+       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr);
 
        if (InputsRead)
                WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
@@ -1839,7 +1931,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
        if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
        }while(0)
 
-static inline void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
+static INLINE void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
 {
        int i;
 
@@ -1920,13 +2012,14 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
     if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (2 << R300_PVS_NUM_FPUS_SHIFT);
     else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
-            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560))
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (5 << R300_PVS_NUM_FPUS_SHIFT);
-    else if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420)
+    else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (6 << R300_PVS_NUM_FPUS_SHIFT);
     else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
-            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580) ||
-            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (8 << R300_PVS_NUM_FPUS_SHIFT);
     else
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (4 << R300_PVS_NUM_FPUS_SHIFT);
@@ -2065,8 +2158,10 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
        case GL_ALPHA_TEST:
                r300SetAlphaState(ctx);
                break;
-       case GL_BLEND:
        case GL_COLOR_LOGIC_OP:
+               r300SetLogicOpState(ctx);
+               /* fall-through, because logic op overrides blending */
+       case GL_BLEND:
                r300SetBlendState(ctx);
                break;
        case GL_CLIP_PLANE0:
@@ -2123,7 +2218,7 @@ static void r300ResetHwState(r300ContextPtr r300)
        r300DepthFunc(ctx, ctx->Depth.Func);
 
        /* stencil */
-       r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
+       r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil._Enabled);
        r300StencilMaskSeparate(ctx, 0, ctx->Stencil.WriteMask[0]);
        r300StencilFuncSeparate(ctx, 0, ctx->Stencil.Function[0],
                                ctx->Stencil.Ref[0], ctx->Stencil.ValueMask[0]);
@@ -2136,6 +2231,7 @@ static void r300ResetHwState(r300ContextPtr r300)
        r300UpdateTextureState(ctx);
 
        r300SetBlendState(ctx);
+       r300SetLogicOpState(ctx);
 
        r300AlphaFunc(ctx, ctx->Color.AlphaFunc, ctx->Color.AlphaRef);
        r300Enable(ctx, GL_ALPHA_TEST, ctx->Color.AlphaEnabled);
@@ -2402,6 +2498,27 @@ void r300UpdateShaders(r300ContextPtr rmesa)
        r300UpdateStateParameters(ctx, _NEW_PROGRAM);
 }
 
+static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
+       struct gl_program *program, struct prog_src_register srcreg)
+{
+       static const GLfloat dummy[4] = { 0, 0, 0, 0 };
+
+       switch(srcreg.File) {
+       case PROGRAM_LOCAL_PARAM:
+               return program->LocalParams[srcreg.Index];
+       case PROGRAM_ENV_PARAM:
+               return ctx->FragmentProgram.Parameters[srcreg.Index];
+       case PROGRAM_STATE_VAR:
+       case PROGRAM_NAMED_PARAM:
+       case PROGRAM_CONSTANT:
+               return program->Parameters->ParameterValues[srcreg.Index];
+       default:
+               _mesa_problem(ctx, "get_fragmentprogram_constant: Unknown\n");
+               return dummy;
+       }
+}
+
+
 static void r300SetupPixelShader(r300ContextPtr rmesa)
 {
        GLcontext *ctx = rmesa->radeon.glCtx;
@@ -2424,26 +2541,17 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
        r300SetupTextures(ctx);
 
        R300_STATECHANGE(rmesa, fpi[0]);
-       rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu_end + 1);
-       for (i = 0; i <= code->alu_end; i++) {
-               rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
-       }
-
        R300_STATECHANGE(rmesa, fpi[1]);
-       rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu_end + 1);
-       for (i = 0; i <= code->alu_end; i++) {
-               rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
-       }
-
        R300_STATECHANGE(rmesa, fpi[2]);
-       rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu_end + 1);
-       for (i = 0; i <= code->alu_end; i++) {
-               rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2;
-       }
-
        R300_STATECHANGE(rmesa, fpi[3]);
-       rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu_end + 1);
-       for (i = 0; i <= code->alu_end; i++) {
+       rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu.length);
+       rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu.length);
+       rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu.length);
+       rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+       for (i = 0; i < code->alu.length; i++) {
+               rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
+               rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
+               rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2;
                rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst3;
        }
 
@@ -2451,10 +2559,10 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
        rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->cur_node | (code->first_node_has_tex << 3);
        rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->max_temp_idx;
        rmesa->hw.fp.cmd[R300_FP_CNTL2] =
-         (code->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
-         (code->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) |
-         (code->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
-         (code->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
+         (0 << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
+         ((code->alu.length-1) << R300_PFS_CNTL_ALU_END_SHIFT) |
+         (0 << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
+         ((code->tex.length ? code->tex.length-1 : 0) << R300_PFS_CNTL_TEX_END_SHIFT);
        /* I just want to say, the way these nodes are stored.. weird.. */
        for (i = 0, k = (4 - (code->cur_node + 1)); i < 4; i++, k++) {
                if (i < (code->cur_node + 1)) {
@@ -2472,10 +2580,12 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
        R300_STATECHANGE(rmesa, fpp);
        rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
        for (i = 0; i < code->const_nr; i++) {
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(code->constant[i][0]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(code->constant[i][1]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(code->constant[i][2]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(code->constant[i][3]);
+               const GLfloat *constant = get_fragmentprogram_constant(ctx,
+                       &fp->mesa_program.Base, code->constant[i]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(constant[3]);
        }
 }
 
@@ -2499,6 +2609,7 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
        struct r500_fragment_program *fp = (struct r500_fragment_program *)
            (char *)ctx->FragmentProgram._Current;
        int i;
+       struct r500_fragment_program_code *code;
 
        if (!fp)                /* should only happenen once, just after context is created */
                return;
@@ -2512,42 +2623,55 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
                        __FUNCTION__);
                return;
        }
+       code = &fp->code;
+
+       if (fp->mesa_program.FogOption != GL_NONE) {
+               /* Enable HW fog. Try not to squish GL context.
+                * (Anybody sane remembered to set glFog() opts first!) */
+               r300SetFogState(ctx, GL_TRUE);
+               ctx->Fog.Mode = fp->mesa_program.FogOption;
+               r300Fogfv(ctx, GL_FOG_MODE, NULL);
+       } else
+               /* Make sure HW is matching GL context. */
+               r300SetFogState(ctx, ctx->Fog.Enabled);
 
        r300SetupTextures(ctx);
 
        R300_STATECHANGE(rmesa, fp);
-       rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = fp->max_temp_idx;
+       rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
 
        rmesa->hw.fp.cmd[R500_FP_CODE_ADDR] =
-           R500_US_CODE_START_ADDR(fp->inst_offset) |
-           R500_US_CODE_END_ADDR(fp->inst_end);
+           R500_US_CODE_START_ADDR(code->inst_offset) |
+           R500_US_CODE_END_ADDR(code->inst_end);
        rmesa->hw.fp.cmd[R500_FP_CODE_RANGE] =
-           R500_US_CODE_RANGE_ADDR(fp->inst_offset) |
-           R500_US_CODE_RANGE_SIZE(fp->inst_end);
+           R500_US_CODE_RANGE_ADDR(code->inst_offset) |
+           R500_US_CODE_RANGE_SIZE(code->inst_end);
        rmesa->hw.fp.cmd[R500_FP_CODE_OFFSET] =
            R500_US_CODE_OFFSET_ADDR(0); /* FIXME when we add flow control */
 
        R300_STATECHANGE(rmesa, r500fp);
        /* Emit our shader... */
-       for (i = 0; i < fp->inst_end+1; i++) {
-               rmesa->hw.r500fp.cmd[i*6+1] = fp->inst[i].inst0;
-               rmesa->hw.r500fp.cmd[i*6+2] = fp->inst[i].inst1;
-               rmesa->hw.r500fp.cmd[i*6+3] = fp->inst[i].inst2;
-               rmesa->hw.r500fp.cmd[i*6+4] = fp->inst[i].inst3;
-               rmesa->hw.r500fp.cmd[i*6+5] = fp->inst[i].inst4;
-               rmesa->hw.r500fp.cmd[i*6+6] = fp->inst[i].inst5;
+       for (i = 0; i < code->inst_end+1; i++) {
+               rmesa->hw.r500fp.cmd[i*6+1] = code->inst[i].inst0;
+               rmesa->hw.r500fp.cmd[i*6+2] = code->inst[i].inst1;
+               rmesa->hw.r500fp.cmd[i*6+3] = code->inst[i].inst2;
+               rmesa->hw.r500fp.cmd[i*6+4] = code->inst[i].inst3;
+               rmesa->hw.r500fp.cmd[i*6+5] = code->inst[i].inst4;
+               rmesa->hw.r500fp.cmd[i*6+6] = code->inst[i].inst5;
        }
 
-       bump_r500fp_count(rmesa->hw.r500fp.cmd, (fp->inst_end + 1) * 6);
+       bump_r500fp_count(rmesa->hw.r500fp.cmd, (code->inst_end + 1) * 6);
 
        R300_STATECHANGE(rmesa, r500fp_const);
-       for (i = 0; i < fp->const_nr; i++) {
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(fp->constant[i][0]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(fp->constant[i][1]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(fp->constant[i][2]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(fp->constant[i][3]);
+       for (i = 0; i < code->const_nr; i++) {
+               const GLfloat *constant = get_fragmentprogram_constant(ctx,
+                       &fp->mesa_program.Base, code->constant[i]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]);
        }
-       bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, fp->const_nr * 4);
+       bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
 
 }
 
@@ -2685,6 +2809,10 @@ void r300InitStateFuncs(struct dd_function_table *functions)
        functions->Fogfv = r300Fogfv;
        functions->FrontFace = r300FrontFace;
        functions->ShadeModel = r300ShadeModel;
+       functions->LogicOpcode = r300LogicOpcode;
+
+       /* ARB_point_parameters */
+       functions->PointParameterfv = r300PointParameter;
 
        /* Stencil related */
        functions->StencilFuncSeparate = r300StencilFuncSeparate;