r300: rewrite and hopefully simplify RS setup
[mesa.git] / src / mesa / drivers / dri / r300 / r300_state.c
index cb39ce01106a4b8bf77448a52b62b468869170d7..171a1549188b9b240b37069a08ef1b1fe383cd10 100644 (file)
@@ -35,23 +35,23 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \author Nicolai Haehnle <prefect_@gmx.net>
  */
 
-#include "glheader.h"
-#include "state.h"
-#include "imports.h"
-#include "enums.h"
-#include "macros.h"
-#include "context.h"
-#include "dd.h"
-#include "simple_list.h"
-
-#include "api_arrayelt.h"
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/dd.h"
+#include "main/simple_list.h"
+#include "main/api_arrayelt.h"
+#include "main/texformat.h"
+
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
 #include "shader/prog_parameter.h"
 #include "shader/prog_statevars.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
-#include "texformat.h"
 
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
@@ -60,6 +60,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_emit.h"
+#include "r300_fragprog.h"
 #include "r300_tex.h"
 
 #include "drirenderbuffer.h"
@@ -69,20 +70,28 @@ extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
 
 static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
 {
-       GLubyte color[4];
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
 
        R300_STATECHANGE(rmesa, blend_color);
 
-       CLAMPED_FLOAT_TO_UBYTE(color[0], cf[0]);
-       CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]);
-       CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]);
-       CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]);
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+               GLuint r = IROUND(cf[0]*1023.0f);
+               GLuint g = IROUND(cf[1]*1023.0f);
+               GLuint b = IROUND(cf[2]*1023.0f);
+               GLuint a = IROUND(cf[3]*1023.0f);
+
+               rmesa->hw.blend_color.cmd[1] = r | (a << 16);
+               rmesa->hw.blend_color.cmd[2] = b | (g << 16);
+       } else {
+               GLubyte color[4];
+               CLAMPED_FLOAT_TO_UBYTE(color[0], cf[0]);
+               CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]);
+               CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]);
+               CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]);
 
-       rmesa->hw.blend_color.cmd[1] = PACK_COLOR_8888(color[3], color[0],
-                                                      color[1], color[2]);
-       rmesa->hw.blend_color.cmd[2] = 0;
-       rmesa->hw.blend_color.cmd[3] = 0;
+               rmesa->hw.blend_color.cmd[1] = PACK_COLOR_8888(color[3], color[0],
+                                                       color[1], color[2]);
+       }
 }
 
 /**
@@ -312,6 +321,44 @@ static void r300BlendFuncSeparate(GLcontext * ctx,
        r300SetBlendState(ctx);
 }
 
+/**
+ * Convert a GL logic-op enum into the ROP field of the ROPCNTL word.
+ *
+ * The offset of \p logicop from GL_CLEAR is treated as a 4-bit code whose
+ * bit order is mirrored (bit 0 <-> bit 3, bit 1 <-> bit 2).  The mirrored
+ * code is then moved into position with R300_RB3D_ROPCNTL_ROP_SHIFT.
+ * Any bits of the offset above the low four are discarded.
+ */
+static GLuint translate_logicop(GLenum logicop)
+{
+       const GLuint code = logicop - GL_CLEAR;
+       GLuint mirrored = 0;
+       int bit;
+
+       /* Source bit i lands at destination bit (3 - i). */
+       for (bit = 0; bit < 4; ++bit) {
+               if (code & (1u << bit))
+                       mirrored |= 1u << (3 - bit);
+       }
+       return mirrored << R300_RB3D_ROPCNTL_ROP_SHIFT;
+}
+
+/**
+ * Bring the "rop" entry of r300->hw in line with the current GL state.
+ *
+ * R300_STATECHANGE is invoked unconditionally for the rop atom.  The value
+ * word then receives the enable flag combined with the translated
+ * ctx->Color.LogicOp while RGBA logic ops are enabled, and 0 otherwise.
+ */
+static void r300SetLogicOpState(GLcontext *ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       R300_STATECHANGE(r300, rop);
+       r300->hw.rop.cmd[1] = RGBA_LOGICOP_ENABLED(ctx)
+               ? (R300_RB3D_ROPCNTL_ROP_ENABLE | translate_logicop(ctx->Color.LogicOp))
+               : 0;
+}
+
+/**
+ * Driver hook invoked by Mesa when the application calls glLogicOp.
+ *
+ * The hardware ROP word is only rewritten while RGBA logic ops are enabled;
+ * otherwise this is a no-op.  The new opcode is picked up from ctx inside
+ * r300SetLogicOpState(), so \p logicop itself is not consulted here.
+ */
+static void r300LogicOpcode(GLcontext *ctx, GLenum logicop)
+{
+       if (!(RGBA_LOGICOP_ENABLED(ctx)))
+               return;
+
+       r300SetLogicOpState(ctx);
+}
+
 static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
@@ -402,42 +449,40 @@ static void r300SetPolygonOffsetState(GLcontext * ctx, GLboolean state)
        }
 }
 
-static void r300SetEarlyZState(GLcontext * ctx)
+static GLboolean current_fragment_program_writes_depth(GLcontext* ctx)
 {
-       /* updates register R300_RB3D_EARLY_Z (0x4F14)
-          if depth test is not enabled it should be R300_EARLY_Z_DISABLE
-          if depth is enabled and alpha not it should be R300_EARLY_Z_ENABLE
-          if depth and alpha is enabled it should be R300_EARLY_Z_DISABLE
-        */
        r300ContextPtr r300 = R300_CONTEXT(ctx);
 
-       R300_STATECHANGE(r300, zstencil_format);
-       switch (ctx->Visual.depthBits) {
-       case 16:
-               r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_16BIT_INT_Z;
-               break;
-       case 24:
-               r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-               break;
-       default:
-               fprintf(stderr, "Error: Unsupported depth %d... exiting\n", ctx->Visual.depthBits);
-               _mesa_exit(-1);
+       if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+               struct r300_fragment_program *fp = (struct r300_fragment_program *)
+                       (char *)ctx->FragmentProgram._Current;
+               return (fp && fp->WritesDepth);
+       } else {
+               struct r500_fragment_program* fp =
+                       (struct r500_fragment_program*)(char*)
+                       ctx->FragmentProgram._Current;
+               return (fp && fp->writes_depth);
        }
+}
+
+static void r300SetEarlyZState(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       GLuint topZ = R300_ZTOP_ENABLE;
 
        if (ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS)
-               /* disable early Z */
-               r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
-       else {
-               if (ctx->Depth.Test && ctx->Depth.Func != GL_NEVER)
-                       /* enable early Z */
-                       r300->hw.zstencil_format.cmd[2] = R300_ZTOP_ENABLE;
-               else
-                       /* disable early Z */
-                       r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
+               topZ = R300_ZTOP_DISABLE;
+       if (current_fragment_program_writes_depth(ctx))
+               topZ = R300_ZTOP_DISABLE;
+
+       if (topZ != r300->hw.zstencil_format.cmd[2]) {
+               /* Note: This completely reemits the stencil format.
+                * I have not tested whether this is strictly necessary,
+                * or if emitting a write to ZB_ZTOP is enough.
+                */
+               R300_STATECHANGE(r300, zstencil_format);
+               r300->hw.zstencil_format.cmd[2] = topZ;
        }
-
-       r300->hw.zstencil_format.cmd[3] = 0x00000003;
-       r300->hw.zstencil_format.cmd[4] = 0x00000000;
 }
 
 static void r300SetAlphaState(GLcontext * ctx)
@@ -479,6 +524,7 @@ static void r300SetAlphaState(GLcontext * ctx)
 
        if (really_enabled) {
                pp_misc |= R300_FG_ALPHA_FUNC_ENABLE;
+               pp_misc |= R500_FG_ALPHA_FUNC_8BIT;
                pp_misc |= (refByte & R300_FG_ALPHA_FUNC_VAL_MASK);
        } else {
                pp_misc = 0x0;
@@ -526,24 +572,15 @@ static void r300SetDepthState(GLcontext * ctx)
        r300ContextPtr r300 = R300_CONTEXT(ctx);
 
        R300_STATECHANGE(r300, zs);
-       r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE; // XXX
-       r300->hw.zs.cmd[R300_ZS_CNTL_1] &=
-           ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
+       r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE|R300_STENCIL_FRONT_BACK;
+       r300->hw.zs.cmd[R300_ZS_CNTL_1] &= ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
 
-       if (ctx->Depth.Test && ctx->Depth.Func != GL_NEVER) {
+       if (ctx->Depth.Test) {
+               r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_ENABLE;
                if (ctx->Depth.Mask)
-                       r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
-                           R300_Z_ENABLE | R300_Z_WRITE_ENABLE | R300_STENCIL_FRONT_BACK; // XXX
-               else
-                   r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_ENABLE | R300_STENCIL_FRONT_BACK; // XXX
-
-               r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
-                   translate_func(ctx->Depth.
-                                  Func) << R300_Z_FUNC_SHIFT;
-       } else {
-           r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK; // XXX
+                       r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_WRITE_ENABLE;
                r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
-                   translate_func(GL_NEVER) << R300_Z_FUNC_SHIFT;
+                   translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
        }
 
        r300SetEarlyZState(ctx);
@@ -710,8 +747,6 @@ static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
 
        switch (pname) {
        case GL_FOG_MODE:
-               if (!ctx->Fog.Enabled)
-                       return;
                switch (ctx->Fog.Mode) {
                case GL_LINEAR:
                        R300_STATECHANGE(r300, fogs);
@@ -748,6 +783,7 @@ static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
                            R300_FG_FOG_BLEND_FN_EXP2;
                        fogScale.f = 0.3 * ctx->Fog.Density;
                        fogStart.f = 0.0;
+                        break;
                default:
                        return;
                }
@@ -836,6 +872,31 @@ static void r300PointSize(GLcontext * ctx, GLfloat size)
            ((int)(size * 6) << R300_POINTSIZE_Y_SHIFT);
 }
 
+/**
+ * Scale a GL point size by 6 (the factor used for the point min/max
+ * register fields below) and saturate the result to \p field_max.
+ *
+ * Saturating in floating point keeps the value inside its register field
+ * and avoids converting an out-of-range (or NaN) value to GLuint, which is
+ * undefined behaviour in C.
+ *
+ * \param size       point size taken from the GL context
+ * \param field_max  largest value the destination field can hold
+ * \return the scaled size, truncated towards zero, in [0, field_max]
+ */
+static GLuint r300_pack_point_size(GLfloat size, GLuint field_max)
+{
+       const double scaled = size * 6.0;
+
+       if (!(scaled > 0.0))            /* zero, negative or NaN */
+               return 0;
+       if (scaled >= (double)field_max)
+               return field_max;
+       return (GLuint)scaled;
+}
+
+/**
+ * Driver hook for point parameter changes.
+ *
+ * GL_POINT_SIZE_MIN and GL_POINT_SIZE_MAX rewrite the matching field of the
+ * ga_point_minmax state word from ctx->Point.MinSize / ctx->Point.MaxSize.
+ * All other parameters are accepted and ignored.
+ *
+ * \param ctx    GL context holding the already-updated point state
+ * \param pname  parameter that changed
+ * \param param  new value(s); unused, the values are read from \p ctx
+ */
+static void r300PointParameter(GLcontext * ctx, GLenum pname, const GLfloat * param)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       /* The min field is written unshifted, so its mask is taken to be its
+        * maximum value as well -- NOTE(review): confirm against r300_reg.h. */
+       const GLuint min_limit = (GLuint)R300_GA_POINT_MINMAX_MIN_MASK;
+       const GLuint max_limit =
+               (GLuint)R300_GA_POINT_MINMAX_MAX_MASK >> R300_GA_POINT_MINMAX_MAX_SHIFT;
+
+       switch (pname) {
+       case GL_POINT_SIZE_MIN:
+               R300_STATECHANGE(r300, ga_point_minmax);
+               r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MIN_MASK;
+               /* Saturate so an oversized minimum cannot leak into the max field. */
+               r300->hw.ga_point_minmax.cmd[1] |=
+                       r300_pack_point_size(ctx->Point.MinSize, min_limit);
+               break;
+       case GL_POINT_SIZE_MAX:
+               R300_STATECHANGE(r300, ga_point_minmax);
+               r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MAX_MASK;
+               /* Saturate so an oversized maximum is not shifted out of the word. */
+               r300->hw.ga_point_minmax.cmd[1] |=
+                       r300_pack_point_size(ctx->Point.MaxSize, max_limit)
+                       << R300_GA_POINT_MINMAX_MAX_SHIFT;
+               break;
+       case GL_POINT_DISTANCE_ATTENUATION:
+       case GL_POINT_FADE_THRESHOLD_SIZE:
+               /* No hardware state is programmed for these here. */
+               break;
+       default:
+               break;
+       }
+}
+
 /* =============================================================
  * Line state
  */
@@ -914,19 +975,13 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        GLuint refmask =
-           (((ctx->Stencil.
-              Ref[0] & 0xff) << R300_STENCILREF_SHIFT) | ((ctx->
-                                                           Stencil.
-                                                           ValueMask
-                                                           [0] &
-                                                           0xff)
-                                                          <<
-                                                          R300_STENCILMASK_SHIFT));
-
+           ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
+            | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
+       const unsigned back = ctx->Stencil._BackFace;
        GLuint flag;
 
        R300_STATECHANGE(rmesa, zs);
-
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &= ~((R300_ZS_MASK <<
                                               R300_S_FRONT_FUNC_SHIFT)
                                              | (R300_ZS_MASK <<
@@ -940,8 +995,7 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
            (flag << R300_S_FRONT_FUNC_SHIFT);
 
-       if (ctx->Stencil._TestTwoSide)
-               flag = translate_func(ctx->Stencil.Function[1]);
+       flag = translate_func(ctx->Stencil.Function[back]);
 
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
            (flag << R300_S_BACK_FUNC_SHIFT);
@@ -966,6 +1020,7 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
                                  GLenum fail, GLenum zfail, GLenum zpass)
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       const unsigned back = ctx->Stencil._BackFace;
 
        R300_STATECHANGE(rmesa, zs);
        /* It is easier to mask what's left.. */
@@ -982,34 +1037,13 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
            | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
               R300_S_FRONT_ZPASS_OP_SHIFT);
 
-       if (ctx->Stencil._TestTwoSide) {
-               rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
-                   (translate_stencil_op(ctx->Stencil.FailFunc[1]) <<
-                    R300_S_BACK_SFAIL_OP_SHIFT)
-                   | (translate_stencil_op(ctx->Stencil.ZFailFunc[1]) <<
-                      R300_S_BACK_ZFAIL_OP_SHIFT)
-                   | (translate_stencil_op(ctx->Stencil.ZPassFunc[1]) <<
-                      R300_S_BACK_ZPASS_OP_SHIFT);
-       } else {
-               rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
-                   (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
-                    R300_S_BACK_SFAIL_OP_SHIFT)
-                   | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
-                      R300_S_BACK_ZFAIL_OP_SHIFT)
-                   | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
-                      R300_S_BACK_ZPASS_OP_SHIFT);
-       }
-}
-
-static void r300ClearStencil(GLcontext * ctx, GLint s)
-{
-       r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-       rmesa->state.stencil.clear =
-           ((GLuint) (ctx->Stencil.Clear & R300_STENCILREF_MASK) |
-            (R300_STENCILREF_MASK << R300_STENCILMASK_SHIFT) |
-            ((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
-               R300_STENCILMASK_SHIFT));
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+           (translate_stencil_op(ctx->Stencil.FailFunc[back]) <<
+            R300_S_BACK_SFAIL_OP_SHIFT)
+           | (translate_stencil_op(ctx->Stencil.ZFailFunc[back]) <<
+              R300_S_BACK_ZFAIL_OP_SHIFT)
+           | (translate_stencil_op(ctx->Stencil.ZPassFunc[back]) <<
+              R300_S_BACK_ZPASS_OP_SHIFT);
 }
 
 /* =============================================================
@@ -1165,7 +1199,7 @@ r300FetchStateParameter(GLcontext * ctx,
 
                case STATE_R300_TEXRECT_FACTOR:{
                                struct gl_texture_object *t =
-                                   ctx->Texture.Unit[state[2]].CurrentRect;
+                                   ctx->Texture.Unit[state[2]].CurrentTex[TEXTURE_RECT_INDEX];
 
                                if (t && t->Image[0][t->BaseLevel]) {
                                        struct gl_texture_image *image =
@@ -1276,8 +1310,8 @@ static unsigned long gen_fixed_filter(unsigned long f)
            (R300_TX_CLAMP << R300_TX_WRAP_T_SHIFT)) {
                needs_fixing |= 2;
        }
-       if ((f & ((7 - 1) << R300_TX_WRAP_Q_SHIFT)) ==
-           (R300_TX_CLAMP << R300_TX_WRAP_Q_SHIFT)) {
+       if ((f & ((7 - 1) << R300_TX_WRAP_R_SHIFT)) ==
+           (R300_TX_CLAMP << R300_TX_WRAP_R_SHIFT)) {
                needs_fixing |= 4;
        }
 
@@ -1285,7 +1319,7 @@ static unsigned long gen_fixed_filter(unsigned long f)
                return f;
 
        mag = f & R300_TX_MAG_FILTER_MASK;
-       min = f & R300_TX_MIN_FILTER_MASK;
+       min = f & (R300_TX_MIN_FILTER_MASK|R300_TX_MIN_FILTER_MIP_MASK);
 
        /* TODO: Check for anisto filters too */
        if ((mag != R300_TX_MAG_FILTER_NEAREST)
@@ -1317,8 +1351,8 @@ static unsigned long gen_fixed_filter(unsigned long f)
                f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_T_SHIFT;
        }
        if (needs_fixing & 4) {
-               f &= ~((7 - 1) << R300_TX_WRAP_Q_SHIFT);
-               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_Q_SHIFT;
+               f &= ~((7 - 1) << R300_TX_WRAP_R_SHIFT);
+               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_R_SHIFT;
        }
        return f;
 }
@@ -1329,20 +1363,21 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
        int i;
        struct r300_fragment_program *fp = (struct r300_fragment_program *)
            (char *)ctx->FragmentProgram._Current;
+       struct r300_fragment_program_code *code = &fp->code;
 
        R300_STATECHANGE(r300, fpt);
 
-       for (i = 0; i < fp->tex.length; i++) {
+       for (i = 0; i < code->tex.length; i++) {
                int unit;
                int opcode;
                unsigned long val;
-                       
-               unit = fp->tex.inst[i] >> R300_TEX_ID_SHIFT;
+
+               unit = code->tex.inst[i] >> R300_TEX_ID_SHIFT;
                unit &= 15;
-                       
-               val = fp->tex.inst[i];
+
+               val = code->tex.inst[i];
                val &= ~R300_TEX_ID_MASK;
-                       
+
                opcode =
                        (val & R300_TEX_INST_MASK) >> R300_TEX_INST_SHIFT;
                if (opcode == R300_TEX_OP_KIL) {
@@ -1360,10 +1395,9 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
                        }
                }
        }
-       
+
        r300->hw.fpt.cmd[R300_FPT_CMD_0] =
-               cmdpacket0(R300_US_TEX_INST_0, fp->tex.length);
-       
+               cmdpacket0(R300_US_TEX_INST_0, code->tex.length);
 }
 
 static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
@@ -1371,19 +1405,20 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
        int i;
        struct r500_fragment_program *fp = (struct r500_fragment_program *)
            (char *)ctx->FragmentProgram._Current;
+       struct r500_fragment_program_code *code = &fp->code;
 
        /* find all the texture instructions and relocate the texture units */
-       for (i = 0; i < fp->inst_end + 1; i++) {
-               if ((fp->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
+       for (i = 0; i < code->inst_end + 1; i++) {
+               if ((code->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
                        uint32_t val;
                        int unit, opcode, new_unit;
 
-                       val = fp->inst[i].inst1;
+                       val = code->inst[i].inst1;
 
                        unit = (val >> 16) & 0xf;
 
                        val &= ~(0xf << 16);
-                       
+
                        opcode = val & (0x7 << 22);
                        if (opcode == R500_TEX_INST_TEXKILL) {
                                new_unit = 0;
@@ -1395,11 +1430,21 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
                                }
                        }
                        val |= R500_TEX_ID(new_unit);
-                       fp->inst[i].inst1 = val;
+                       code->inst[i].inst1 = val;
                }
        }
 }
 
+/**
+ * Pack a GL LOD bias into the hardware LOD bias field.
+ *
+ * The bias is multiplied by 32, truncated towards zero and limited to the
+ * signed range [-(1 << 9), (1 << 9) - 1] before being shifted by
+ * R300_LOD_BIAS_SHIFT and masked with R300_LOD_BIAS_MASK.
+ *
+ * The limit is applied in floating point, before the integer conversion:
+ * converting a float that does not fit in an int (or a NaN) is undefined
+ * behaviour in C.  A NaN bias is treated as no bias.
+ *
+ * \param bias  combined LOD bias in GL units
+ * \return the bias encoded in its register field position
+ */
+static GLuint translate_lod_bias(GLfloat bias)
+{
+       const GLint max_b = (1 << 9) - 1;
+       const GLint min_b = -(1 << 9);
+       const GLfloat scaled = bias * 32;
+       GLint b;
+
+       if (scaled >= (GLfloat)max_b)
+               b = max_b;
+       else if (scaled <= (GLfloat)min_b)
+               b = min_b;
+       else if (scaled != scaled)      /* NaN compares unequal to itself */
+               b = 0;
+       else
+               b = (GLint)scaled;
+
+       return (((GLuint)b) << R300_LOD_BIAS_SHIFT) & R300_LOD_BIAS_MASK;
+}
+
 static void r300SetupTextures(GLcontext * ctx)
 {
        int i, mtu;
@@ -1463,8 +1508,14 @@ static void r300SetupTextures(GLcontext * ctx)
                        r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
                                                hw_tmu] =
                            gen_fixed_filter(t->filter) | (hw_tmu << 28);
-                       /* Currently disabled! */
-                       r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = 0x0;     //0x20501f80;
+                       /* Note: There is a LOD bias per texture unit and a LOD bias
+                        * per texture object. We add them here to get the correct behaviour.
+                        * (The per-texture object LOD bias was introduced in OpenGL 1.4
+                        * and is not present in the EXT_texture_object extension).
+                        */
+                       r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+                               t->filter_1 |
+                               translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias);
                        r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
                            t->size;
                        r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
@@ -1514,10 +1565,17 @@ static void r300SetupTextures(GLcontext * ctx)
        if (!fp)                /* should only happenen once, just after context is created */
                return;
 
-
-        if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515)
+       if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+               if (fp->mesa_program.UsesKill && last_hw_tmu < 0) {
+                       // The KILL operation requires the first texture unit
+                       // to be enabled.
+                       r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
+                       r300->hw.tex.filter.cmd[R300_TEX_VALUE_0] = 0;
+                       r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+                               cmdpacket0(R300_TX_FILTER0_0, 1);
+               }
                r300SetupFragmentShaderTextures(ctx, tmu_mappings);
-       else 
+       } else
                r500SetupFragmentShaderTextures(ctx, tmu_mappings);
 
        if (RADEON_DEBUG & DEBUG_STATE)
@@ -1537,18 +1595,14 @@ union r300_outputs_written {
 static void r300SetupRSUnit(GLcontext * ctx)
 {
        r300ContextPtr r300 = R300_CONTEXT(ctx);
-       /* I'm still unsure if these are needed */
-       GLuint interp_col[8];
         TNLcontext *tnl = TNL_CONTEXT(ctx);
        struct vertex_buffer *VB = &tnl->vb;
        union r300_outputs_written OutputsWritten;
        GLuint InputsRead;
        int fp_reg, high_rr;
-       int col_interp_nr;
-       int rs_tex_count = 0, rs_col_count = 0;
-       int i, count;
-
-       memset(interp_col, 0, sizeof(interp_col));
+       int col_ip, tex_ip;
+       int rs_tex_count = 0;
+       int i, count, col_fmt;
 
        if (hw_tcl_on)
                OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
@@ -1566,44 +1620,66 @@ static void r300SetupRSUnit(GLcontext * ctx)
        R300_STATECHANGE(r300, rc);
        R300_STATECHANGE(r300, rr);
 
-       fp_reg = col_interp_nr = high_rr = 0;
+       fp_reg = col_ip = tex_ip = col_fmt = 0;
 
-       r300->hw.rr.cmd[R300_RR_INST_1] = 0;
+       r300->hw.rc.cmd[1] = 0;
+       r300->hw.rc.cmd[2] = 0;
+       for (i=0; i<R300_RR_CMDSIZE-1; ++i)
+               r300->hw.rr.cmd[R300_RR_INST_0 + i] = 0;
 
-       if (InputsRead & FRAG_BIT_WPOS) {
-               for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
-                       if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
-                               break;
+       for (i=0; i<R300_RI_CMDSIZE-1; ++i)
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0;
 
-               if (i == ctx->Const.MaxTextureUnits) {
-                       fprintf(stderr, "\tno free texcoord found...\n");
-                       _mesa_exit(-1);
-               }
-
-               InputsRead |= (FRAG_BIT_TEX0 << i);
-               InputsRead &= ~FRAG_BIT_WPOS;
-       }
 
        if (InputsRead & FRAG_BIT_COL0) {
-               count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-               interp_col[0] |= R300_RS_COL_PTR(rs_col_count);
-               if (count == 3)
-                       interp_col[0] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGB1);
-               rs_col_count += count;
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL0;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+               }
        }
-       else
-               interp_col[0] = R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
 
        if (InputsRead & FRAG_BIT_COL1) {
-               count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-               if (count == 3)
-                       interp_col[1] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGB0);
-               interp_col[1] |= R300_RS_COL_PTR(1);
-               rs_col_count += count;
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL1;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+               }
        }
 
-
        for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+               if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
+                   continue;
+
+               if (!R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+                   WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+                   continue;
+               }
+
                int swiz;
 
                /* with TCL we always seem to route 4 components */
@@ -1612,7 +1688,6 @@ static void r300SetupRSUnit(GLcontext * ctx)
                else
                  count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
 
-               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = interp_col[i] | rs_tex_count;
                switch(count) {
                case 4: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3); break;
                case 3: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
@@ -1621,63 +1696,48 @@ static void r300SetupRSUnit(GLcontext * ctx)
                case 2: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
                };
 
-               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] |= swiz;
-               r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] = 0;
-               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-
-                       rs_tex_count += count;
-
-                       //assert(r300->state.texture.tc_count != 0);
-                       r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] |= R300_RS_INST_TEX_CN_WRITE | i       /* source INTERP */
-                           | (fp_reg << R300_RS_INST_TEX_ADDR_SHIFT);
-                       high_rr = fp_reg;
-
-                       /* Passing invalid data here can lock the GPU. */
-                       if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
-                               InputsRead &= ~(FRAG_BIT_TEX0 << i);
-                               fp_reg++;
-                       } else {
-                               WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
-                       }
-               }
-       }
-
-       if (InputsRead & FRAG_BIT_COL0) {
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-                       r300->hw.rr.cmd[R300_RR_INST_0] |= R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
-                       InputsRead &= ~FRAG_BIT_COL0;
-                       col_interp_nr++;
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz | R300_RS_TEX_PTR(rs_tex_count);
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~(FRAG_BIT_TEX0 << i);
+               rs_tex_count += count;
+               ++tex_ip;
+               ++fp_reg;
+       }
+
+       if (InputsRead & FRAG_BIT_FOGC) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_FOGC;
+                       rs_tex_count += 4;
+                       ++tex_ip;
+                       ++fp_reg;
                } else {
-                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+                       WARN_ONCE("fragprog wants fogc, vp doesn't provide it\n");
                }
        }
 
-       if (InputsRead & FRAG_BIT_COL1) {
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-                       r300->hw.rr.cmd[R300_RR_INST_1] |= R300_RS_INST_COL_ID(1) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
-                       InputsRead &= ~FRAG_BIT_COL1;
-                       if (high_rr < 1)
-                               high_rr = 1;
-                       col_interp_nr++;
-               } else {
-                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
-               }
+       if (InputsRead & FRAG_BIT_WPOS) {
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~FRAG_BIT_WPOS;
+               rs_tex_count += 4;
+               ++tex_ip;
+               ++fp_reg;
        }
+       InputsRead &= ~FRAG_BIT_WPOS;
 
-       /* Need at least one. This might still lock as the values are undefined... */
-       if (rs_tex_count == 0 && col_interp_nr == 0) {
-               r300->hw.rr.cmd[R300_RR_INST_0] |= R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
-               col_interp_nr++;
+       /* Setup default color if no color or tex was set */
+       if (rs_tex_count == 0 && col_ip == 0) {
+               r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+               ++col_ip;
        }
 
-       r300->hw.rc.cmd[1] = 0 | (rs_tex_count << R300_IT_COUNT_SHIFT)
-         | (col_interp_nr << R300_IC_COUNT_SHIFT)
-         | R300_HIRES_EN;
+       high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
+       r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+       r300->hw.rc.cmd[2] |= high_rr - 1;
 
-       assert(high_rr >= 0);
-       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr + 1);
-       r300->hw.rc.cmd[2] = high_rr;
+       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr);
 
        if (InputsRead)
                WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
@@ -1686,18 +1746,15 @@ static void r300SetupRSUnit(GLcontext * ctx)
 static void r500SetupRSUnit(GLcontext * ctx)
 {
        r300ContextPtr r300 = R300_CONTEXT(ctx);
-       /* I'm still unsure if these are needed */
-       GLuint interp_col[8];
-       union r300_outputs_written OutputsWritten;
         TNLcontext *tnl = TNL_CONTEXT(ctx);
        struct vertex_buffer *VB = &tnl->vb;
+       union r300_outputs_written OutputsWritten;
        GLuint InputsRead;
        int fp_reg, high_rr;
-       int rs_col_count = 0;
-       int in_texcoords, col_interp_nr;
-       int i, count;
+       int col_ip, tex_ip;
+       int rs_tex_count = 0;
+       int i, count, col_fmt;
 
-       memset(interp_col, 0, sizeof(interp_col));
        if (hw_tcl_on)
                OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
        else
@@ -1714,130 +1771,151 @@ static void r500SetupRSUnit(GLcontext * ctx)
        R300_STATECHANGE(r300, rc);
        R300_STATECHANGE(r300, rr);
 
-       fp_reg = col_interp_nr = high_rr = in_texcoords = 0;
+       fp_reg = col_ip = tex_ip = col_fmt = 0;
 
-       r300->hw.rr.cmd[R300_RR_INST_1] = 0;
+       r300->hw.rc.cmd[1] = 0;
+       r300->hw.rc.cmd[2] = 0;
+       for (i=0; i<R300_RR_CMDSIZE-1; ++i)
+               r300->hw.rr.cmd[R300_RR_INST_0 + i] = 0;
 
-       if (InputsRead & FRAG_BIT_WPOS) {
-               for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
-                       if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
-                               break;
+       for (i=0; i<R500_RI_CMDSIZE-1; ++i)
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0;
 
-               if (i == ctx->Const.MaxTextureUnits) {
-                       fprintf(stderr, "\tno free texcoord found...\n");
-                       _mesa_exit(-1);
-               }
-
-               InputsRead |= (FRAG_BIT_TEX0 << i);
-               InputsRead &= ~FRAG_BIT_WPOS;
-       }
 
        if (InputsRead & FRAG_BIT_COL0) {
-               count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-               interp_col[0] |= R500_RS_COL_PTR(rs_col_count);
-               if (count == 3)
-                       interp_col[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGB1);
-               rs_col_count += count;
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL0;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+               }
        }
-       else
-               interp_col[0] = R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
 
        if (InputsRead & FRAG_BIT_COL1) {
-               count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-               interp_col[1] |= R500_RS_COL_PTR(1);
-               if (count == 3)
-                       interp_col[1] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGB0);
-               rs_col_count += count;
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL1;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+               }
        }
 
+
        for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-               GLuint swiz = 0;
+               if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
+                   continue;
 
-               /* with TCL we always seem to route 4 components */
-               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-                 
-                 if (hw_tcl_on)
-                   count = 4;
-                 else
-                   count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-                 
-                 /* always have on texcoord */
-                 swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_S_SHIFT;
-                 if (count >= 2) 
-                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_T_SHIFT;
-                 else
-                   swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-
-                 if (count >= 3)
-                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_R_SHIFT;
-                 else
-                   swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-
-                 if (count == 4) 
-                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_Q_SHIFT;
-                 else
-                   swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-                 
-               } else
-                  swiz = (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-                         (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-                         (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-                         (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
-
-               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = interp_col[i] | swiz;
-
-               r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] = 0;
-               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-                       //assert(r300->state.texture.tc_count != 0);
-                       r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] |= R500_RS_INST_TEX_CN_WRITE | i       /* source INTERP */
-                           | (fp_reg << R500_RS_INST_TEX_ADDR_SHIFT);
-                       high_rr = fp_reg;
-
-                       /* Passing invalid data here can lock the GPU. */
-                       if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
-                               InputsRead &= ~(FRAG_BIT_TEX0 << i);
-                               fp_reg++;
-                       } else {
-                               WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
-                       }
+               if (!R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+                   WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+                   continue;
                }
-       }
 
-       if (InputsRead & FRAG_BIT_COL0) {
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-                       r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_CN_WRITE | (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
-                       InputsRead &= ~FRAG_BIT_COL0;
-                       col_interp_nr++;
+               int swiz = 0;
+
+               /* with TCL we always seem to route 4 components */
+               if (hw_tcl_on)
+                 count = 4;
+               else
+                 count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
+
+               if (count == 4) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= (rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 3) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 2) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 1) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
                } else {
-                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
                }
-       }
 
-       if (InputsRead & FRAG_BIT_COL1) {
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-                       r300->hw.rr.cmd[R300_RR_INST_1] |= (1 << 12) | R500_RS_INST_COL_CN_WRITE |  (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
-                       InputsRead &= ~FRAG_BIT_COL1;
-                       if (high_rr < 1)
-                               high_rr = 1;
-                       col_interp_nr++;
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz;
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~(FRAG_BIT_TEX0 << i);
+               rs_tex_count += count;
+               ++tex_ip;
+               ++fp_reg;
+       }
+
+       if (InputsRead & FRAG_BIT_FOGC) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                               ((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                               ((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                               ((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+                       r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_FOGC;
+                       rs_tex_count += 4;
+                       ++tex_ip;
+                       ++fp_reg;
                } else {
-                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+                       WARN_ONCE("fragprog wants fogc, vp doesn't provide it\n");
                }
        }
 
-       /* Need at least one. This might still lock as the values are undefined... */
-       if (in_texcoords == 0 && col_interp_nr == 0) {
-               r300->hw.rr.cmd[R300_RR_INST_0] |= 0 | R500_RS_INST_COL_CN_WRITE | (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
-               col_interp_nr++;
+       if (InputsRead & FRAG_BIT_WPOS) {
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                               ((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                               ((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                               ((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~FRAG_BIT_WPOS;
+               rs_tex_count += 4;
+               ++tex_ip;
+               ++fp_reg;
        }
 
-       r300->hw.rc.cmd[1] = 0 | (in_texcoords << R300_IT_COUNT_SHIFT)
-         | (col_interp_nr << R300_IC_COUNT_SHIFT)
-         | R300_HIRES_EN;
+       /* Setup default color if no color or tex was set */
+       if (rs_tex_count == 0 && col_ip == 0) {
+               r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+               ++col_ip;
+       }
 
-       assert(high_rr >= 0);
-       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr + 1);
-       r300->hw.rc.cmd[2] = 0xC0 | high_rr;
+       high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
+       r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+       r300->hw.rc.cmd[2] |= 0xC0 | (high_rr - 1);
+
+       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr);
 
        if (InputsRead)
                WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
@@ -1853,7 +1931,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
        if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
        }while(0)
 
-static inline void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
+static INLINE void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
 {
        int i;
 
@@ -1919,7 +1997,7 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 
     R300_STATECHANGE(rmesa, vap_cntl);
     if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
-       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] = 
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] =
            (pvs_num_slots << R300_PVS_NUM_SLOTS_SHIFT) |
            (pvs_num_cntrls << R300_PVS_NUM_CNTLRS_SHIFT) |
            (12 << R300_VF_MAX_VTX_NUM_SHIFT);
@@ -1934,13 +2012,14 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
     if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (2 << R300_PVS_NUM_FPUS_SHIFT);
     else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
-            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560))
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (5 << R300_PVS_NUM_FPUS_SHIFT);
-    else if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420)
+    else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (6 << R300_PVS_NUM_FPUS_SHIFT);
     else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
-            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580) ||
-            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (8 << R300_PVS_NUM_FPUS_SHIFT);
     else
        rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (4 << R300_PVS_NUM_FPUS_SHIFT);
@@ -2079,8 +2158,10 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
        case GL_ALPHA_TEST:
                r300SetAlphaState(ctx);
                break;
-       case GL_BLEND:
        case GL_COLOR_LOGIC_OP:
+               r300SetLogicOpState(ctx);
+               /* fall-through, because logic op overrides blending */
+       case GL_BLEND:
                r300SetBlendState(ctx);
                break;
        case GL_CLIP_PLANE0:
@@ -2137,7 +2218,7 @@ static void r300ResetHwState(r300ContextPtr r300)
        r300DepthFunc(ctx, ctx->Depth.Func);
 
        /* stencil */
-       r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
+       r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil._Enabled);
        r300StencilMaskSeparate(ctx, 0, ctx->Stencil.WriteMask[0]);
        r300StencilFuncSeparate(ctx, 0, ctx->Stencil.Function[0],
                                ctx->Stencil.Ref[0], ctx->Stencil.ValueMask[0]);
@@ -2150,6 +2231,7 @@ static void r300ResetHwState(r300ContextPtr r300)
        r300UpdateTextureState(ctx);
 
        r300SetBlendState(ctx);
+       r300SetLogicOpState(ctx);
 
        r300AlphaFunc(ctx, ctx->Color.AlphaFunc, ctx->Color.AlphaRef);
        r300Enable(ctx, GL_ALPHA_TEST, ctx->Color.AlphaEnabled);
@@ -2339,6 +2421,23 @@ static void r300ResetHwState(r300ContextPtr r300)
 
        r300->hw.zb_depthclearvalue.cmd[1] = 0;
 
+       switch (ctx->Visual.depthBits) {
+       case 16:
+               r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_16BIT_INT_Z;
+               break;
+       case 24:
+               r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+               break;
+       default:
+               fprintf(stderr, "Error: Unsupported depth %d... exiting\n", ctx->Visual.depthBits);
+               _mesa_exit(-1);
+       }
+
+       r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
+       r300->hw.zstencil_format.cmd[3] = 0x00000003;
+       r300->hw.zstencil_format.cmd[4] = 0x00000000;
+       r300SetEarlyZState(ctx);
+
        r300->hw.unk4F30.cmd[1] = 0;
        r300->hw.unk4F30.cmd[2] = 0;
 
@@ -2399,11 +2498,33 @@ void r300UpdateShaders(r300ContextPtr rmesa)
        r300UpdateStateParameters(ctx, _NEW_PROGRAM);
 }
 
+static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
+       struct gl_program *program, struct prog_src_register srcreg)
+{
+       static const GLfloat dummy[4] = { 0, 0, 0, 0 }; /* returned when srcreg.File is not handled below */
+       /* Select the array that stores srcreg's register file and return the entry at srcreg.Index (not range-checked here). */
+       switch(srcreg.File) {
+       case PROGRAM_LOCAL_PARAM: /* read from the program passed in */
+               return program->LocalParams[srcreg.Index];
+       case PROGRAM_ENV_PARAM: /* read from the GL context rather than from the program */
+               return ctx->FragmentProgram.Parameters[srcreg.Index];
+       case PROGRAM_STATE_VAR:
+       case PROGRAM_NAMED_PARAM:
+       case PROGRAM_CONSTANT: /* these three files all index the program's parameter list */
+               return program->Parameters->ParameterValues[srcreg.Index];
+       default: /* flag the unexpected file via _mesa_problem and fall back to the all-zero vector */
+               _mesa_problem(ctx, "get_fragmentprogram_constant: Unknown\n");
+               return dummy;
+       }
+}
+
+
 static void r300SetupPixelShader(r300ContextPtr rmesa)
 {
        GLcontext *ctx = rmesa->radeon.glCtx;
        struct r300_fragment_program *fp = (struct r300_fragment_program *)
            (char *)ctx->FragmentProgram._Current;
+       struct r300_fragment_program_code *code;
        int i, k;
 
        if (!fp)                /* should only happenen once, just after context is created */
@@ -2415,62 +2536,56 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
                        __FUNCTION__);
                return;
        }
+       code = &fp->code;
 
        r300SetupTextures(ctx);
 
        R300_STATECHANGE(rmesa, fpi[0]);
-       rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, fp->alu_end + 1);
-       for (i = 0; i <= fp->alu_end; i++) {
-               rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst0;
-       }
-
        R300_STATECHANGE(rmesa, fpi[1]);
-       rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, fp->alu_end + 1);
-       for (i = 0; i <= fp->alu_end; i++) {
-               rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst1;
-       }
-
        R300_STATECHANGE(rmesa, fpi[2]);
-       rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, fp->alu_end + 1);
-       for (i = 0; i <= fp->alu_end; i++) {
-               rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst2;
-       }
-
        R300_STATECHANGE(rmesa, fpi[3]);
-       rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, fp->alu_end + 1);
-       for (i = 0; i <= fp->alu_end; i++) {
-               rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst3;
+       rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu.length);
+       rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu.length);
+       rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu.length);
+       rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+       for (i = 0; i < code->alu.length; i++) {
+               rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
+               rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
+               rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2;
+               rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst3;
        }
 
        R300_STATECHANGE(rmesa, fp);
-       rmesa->hw.fp.cmd[R300_FP_CNTL0] = fp->cur_node | (fp->first_node_has_tex << 3);
-       rmesa->hw.fp.cmd[R300_FP_CNTL1] = fp->max_temp_idx;
+       rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->cur_node | (code->first_node_has_tex << 3);
+       rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->max_temp_idx;
        rmesa->hw.fp.cmd[R300_FP_CNTL2] =
-         (fp->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
-         (fp->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) |
-         (fp->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
-         (fp->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
+         (0 << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
+         ((code->alu.length-1) << R300_PFS_CNTL_ALU_END_SHIFT) |
+         (0 << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
+         ((code->tex.length ? code->tex.length-1 : 0) << R300_PFS_CNTL_TEX_END_SHIFT);
        /* I just want to say, the way these nodes are stored.. weird.. */
-       for (i = 0, k = (4 - (fp->cur_node + 1)); i < 4; i++, k++) {
-               if (i < (fp->cur_node + 1)) {
+       for (i = 0, k = (4 - (code->cur_node + 1)); i < 4; i++, k++) {
+               if (i < (code->cur_node + 1)) {
                        rmesa->hw.fp.cmd[R300_FP_NODE0 + k] =
-                         (fp->node[i].alu_offset << R300_ALU_START_SHIFT) |
-                         (fp->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
-                         (fp->node[i].tex_offset << R300_TEX_START_SHIFT) |
-                         (fp->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
-                         fp->node[i].flags;
+                         (code->node[i].alu_offset << R300_ALU_START_SHIFT) |
+                         (code->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
+                         (code->node[i].tex_offset << R300_TEX_START_SHIFT) |
+                         (code->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
+                         code->node[i].flags;
                } else {
                        rmesa->hw.fp.cmd[R300_FP_NODE0 + (3 - i)] = 0;
                }
        }
 
        R300_STATECHANGE(rmesa, fpp);
-       rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, fp->const_nr * 4);
-       for (i = 0; i < fp->const_nr; i++) {
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(fp->constant[i][0]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(fp->constant[i][1]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(fp->constant[i][2]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(fp->constant[i][3]);
+       rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
+       for (i = 0; i < code->const_nr; i++) {
+               const GLfloat *constant = get_fragmentprogram_constant(ctx,
+                       &fp->mesa_program.Base, code->constant[i]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(constant[3]);
        }
 }
 
@@ -2494,6 +2609,7 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
        struct r500_fragment_program *fp = (struct r500_fragment_program *)
            (char *)ctx->FragmentProgram._Current;
        int i;
+       struct r500_fragment_program_code *code;
 
        if (!fp)                /* should only happenen once, just after context is created */
                return;
@@ -2507,42 +2623,55 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
                        __FUNCTION__);
                return;
        }
+       code = &fp->code;
+
+       if (fp->mesa_program.FogOption != GL_NONE) {
+               /* Enable HW fog. Try not to squish GL context.
+                * (Anybody sane remembered to set glFog() opts first!) */
+               r300SetFogState(ctx, GL_TRUE);
+               ctx->Fog.Mode = fp->mesa_program.FogOption;
+               r300Fogfv(ctx, GL_FOG_MODE, NULL);
+       } else
+               /* Make sure HW is matching GL context. */
+               r300SetFogState(ctx, ctx->Fog.Enabled);
 
        r300SetupTextures(ctx);
 
        R300_STATECHANGE(rmesa, fp);
-       rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = fp->max_temp_idx;
+       rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
 
        rmesa->hw.fp.cmd[R500_FP_CODE_ADDR] =
-           R500_US_CODE_START_ADDR(fp->inst_offset) |
-           R500_US_CODE_END_ADDR(fp->inst_end);
+           R500_US_CODE_START_ADDR(code->inst_offset) |
+           R500_US_CODE_END_ADDR(code->inst_end);
        rmesa->hw.fp.cmd[R500_FP_CODE_RANGE] =
-           R500_US_CODE_RANGE_ADDR(fp->inst_offset) |
-           R500_US_CODE_RANGE_SIZE(fp->inst_end);
+           R500_US_CODE_RANGE_ADDR(code->inst_offset) |
+           R500_US_CODE_RANGE_SIZE(code->inst_end);
        rmesa->hw.fp.cmd[R500_FP_CODE_OFFSET] =
            R500_US_CODE_OFFSET_ADDR(0); /* FIXME when we add flow control */
 
        R300_STATECHANGE(rmesa, r500fp);
        /* Emit our shader... */
-       for (i = 0; i < fp->inst_end+1; i++) {
-               rmesa->hw.r500fp.cmd[i*6+1] = fp->inst[i].inst0;
-               rmesa->hw.r500fp.cmd[i*6+2] = fp->inst[i].inst1;
-               rmesa->hw.r500fp.cmd[i*6+3] = fp->inst[i].inst2;
-               rmesa->hw.r500fp.cmd[i*6+4] = fp->inst[i].inst3;
-               rmesa->hw.r500fp.cmd[i*6+5] = fp->inst[i].inst4;
-               rmesa->hw.r500fp.cmd[i*6+6] = fp->inst[i].inst5;
+       for (i = 0; i < code->inst_end+1; i++) {
+               rmesa->hw.r500fp.cmd[i*6+1] = code->inst[i].inst0;
+               rmesa->hw.r500fp.cmd[i*6+2] = code->inst[i].inst1;
+               rmesa->hw.r500fp.cmd[i*6+3] = code->inst[i].inst2;
+               rmesa->hw.r500fp.cmd[i*6+4] = code->inst[i].inst3;
+               rmesa->hw.r500fp.cmd[i*6+5] = code->inst[i].inst4;
+               rmesa->hw.r500fp.cmd[i*6+6] = code->inst[i].inst5;
        }
 
-       bump_r500fp_count(rmesa->hw.r500fp.cmd, (fp->inst_end + 1) * 6);
+       bump_r500fp_count(rmesa->hw.r500fp.cmd, (code->inst_end + 1) * 6);
 
        R300_STATECHANGE(rmesa, r500fp_const);
-       for (i = 0; i < fp->const_nr; i++) {
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(fp->constant[i][0]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(fp->constant[i][1]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(fp->constant[i][2]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(fp->constant[i][3]);
+       for (i = 0; i < code->const_nr; i++) {
+               const GLfloat *constant = get_fragmentprogram_constant(ctx,
+                       &fp->mesa_program.Base, code->constant[i]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]);
        }
-       bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, fp->const_nr * 4);
+       bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
 
 }
 
@@ -2552,6 +2681,15 @@ void r300UpdateShaderStates(r300ContextPtr rmesa)
        ctx = rmesa->radeon.glCtx;
 
        r300UpdateTextureState(ctx);
+       r300SetEarlyZState(ctx);
+
+       GLuint fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
+       if (current_fragment_program_writes_depth(ctx))
+               fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
+       if (fgdepthsrc != rmesa->hw.fg_depth_src.cmd[1]) {
+               R300_STATECHANGE(rmesa, fg_depth_src);
+               rmesa->hw.fg_depth_src.cmd[1] = fgdepthsrc;
+       }
 
        if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
                r500SetupPixelShader(rmesa);
@@ -2606,12 +2744,10 @@ void r300InitState(r300ContextPtr r300)
        case 16:
                r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
                depth_fmt = R300_DEPTHFORMAT_16BIT_INT_Z;
-               r300->state.stencil.clear = 0x00000000;
                break;
        case 24:
                r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
                depth_fmt = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-               r300->state.stencil.clear = 0x00ff0000;
                break;
        default:
                fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
@@ -2639,11 +2775,11 @@ void r300UpdateClipPlanes( GLcontext *ctx )
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        GLuint p;
-       
+
        for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
                if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
                        GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
-                       
+
                        R300_STATECHANGE( rmesa, vpucp[p] );
                        rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
                        rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
@@ -2673,9 +2809,12 @@ void r300InitStateFuncs(struct dd_function_table *functions)
        functions->Fogfv = r300Fogfv;
        functions->FrontFace = r300FrontFace;
        functions->ShadeModel = r300ShadeModel;
+       functions->LogicOpcode = r300LogicOpcode;
+
+       /* ARB_point_parameters */
+       functions->PointParameterfv = r300PointParameter;
 
        /* Stencil related */
-       functions->ClearStencil = r300ClearStencil;
        functions->StencilFuncSeparate = r300StencilFuncSeparate;
        functions->StencilMaskSeparate = r300StencilMaskSeparate;
        functions->StencilOpSeparate = r300StencilOpSeparate;