r3xx/r5xx: Enable ARB_point_parameters.
[mesa.git] / src / mesa / drivers / dri / r300 / r300_state.c
index d10242fa1f71c212424f884b39a89863ba08c816..592ee9ccc1024e473b0c40ef1f9aa9fb59bc3679 100644 (file)
@@ -189,7 +189,7 @@ static void r300SetBlendCntl(r300ContextPtr r300, int func, int eqn,
         */
 #if 0
        if (new_ablend == new_cblend) {
-               new_cblend |= R300_BLEND_NO_SEPARATE;
+               new_cblend |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0;
        }
 #endif
        new_cblend |= cbits;
@@ -295,7 +295,9 @@ static void r300SetBlendState(GLcontext * ctx)
 
        r300SetBlendCntl(r300,
                         func, eqn,
-                        R300_BLEND_UNKNOWN | R300_BLEND_ENABLE, funcA, eqnA);
+                        (R300_SEPARATE_ALPHA_ENABLE |
+                         R300_READ_ENABLE |
+                         R300_ALPHA_BLEND_ENABLE), funcA, eqnA);
 }
 
 static void r300BlendEquationSeparate(GLcontext * ctx,
@@ -401,42 +403,40 @@ static void r300SetPolygonOffsetState(GLcontext * ctx, GLboolean state)
        }
 }
 
-static void r300SetEarlyZState(GLcontext * ctx)
+static GLboolean current_fragment_program_writes_depth(GLcontext* ctx) /* Reports whether the currently bound fragment program writes depth; a NULL current program yields false. */
 {
-       /* updates register R300_RB3D_EARLY_Z (0x4F14)
-          if depth test is not enabled it should be R300_EARLY_Z_DISABLE
-          if depth is enabled and alpha not it should be R300_EARLY_Z_ENABLE
-          if depth and alpha is enabled it should be R300_EARLY_Z_DISABLE
-        */
        r300ContextPtr r300 = R300_CONTEXT(ctx);
 
-       R300_STATECHANGE(r300, zstencil_format);
-       switch (ctx->Visual.depthBits) {
-       case 16:
-               r300->hw.zstencil_format.cmd[1] = ZB_FORMAR_DEPTHFORMAT_16BIT_INT_Z;
-               break;
-       case 24:
-               r300->hw.zstencil_format.cmd[1] = ZB_FORMAR_DEPTHFORMAT_24BIT_INT_Z;
-               break;
-       default:
-               fprintf(stderr, "Error: Unsupported depth %d... exiting\n", ctx->Visual.depthBits);
-               _mesa_exit(-1);
+       if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) { /* families ordered below RV515: the program is viewed as struct r300_fragment_program */
+               struct r300_fragment_program *fp = (struct r300_fragment_program *)
+                       (char *)ctx->FragmentProgram._Current;
+               return (fp && fp->WritesDepth);
+       } else { /* RV515 and above: the program is viewed as struct r500_fragment_program */
+               struct r500_fragment_program* fp =
+                       (struct r500_fragment_program*)(char*)
+                       ctx->FragmentProgram._Current;
+               return (fp && fp->writes_depth);
        }
+}
+
+static void r300SetEarlyZState(GLcontext * ctx) /* Chooses the ZTOP value stored in zstencil_format.cmd[2] and marks that state dirty only when the value changes. */
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       GLuint topZ = R300_ZTOP_ENABLE; /* enabled unless one of the two checks below turns it off */
 
        if (ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS)
-               /* disable early Z */
-               r300->hw.zstencil_format.cmd[2] = R300_EARLY_Z_DISABLE;
-       else {
-               if (ctx->Depth.Test && ctx->Depth.Func != GL_NEVER)
-                       /* enable early Z */
-                       r300->hw.zstencil_format.cmd[2] = R300_EARLY_Z_ENABLE;
-               else
-                       /* disable early Z */
-                       r300->hw.zstencil_format.cmd[2] = R300_EARLY_Z_DISABLE;
+               topZ = R300_ZTOP_DISABLE; /* an alpha test other than GL_ALWAYS is active */
+       if (current_fragment_program_writes_depth(ctx))
+               topZ = R300_ZTOP_DISABLE; /* the fragment program writes depth */
+
+       if (topZ != r300->hw.zstencil_format.cmd[2]) {
+               /* Note: This completely reemits the stencil format.
+                * I have not tested whether this is strictly necessary,
+                * or if emitting a write to ZB_ZTOP is enough.
+                */
+               R300_STATECHANGE(r300, zstencil_format);
+               r300->hw.zstencil_format.cmd[2] = topZ;
        }
-
-       r300->hw.zstencil_format.cmd[3] = 0x00000003;
-       r300->hw.zstencil_format.cmd[4] = 0x00000000;
 }
 
 static void r300SetAlphaState(GLcontext * ctx)
@@ -450,25 +450,25 @@ static void r300SetAlphaState(GLcontext * ctx)
 
        switch (ctx->Color.AlphaFunc) {
        case GL_NEVER:
-               pp_misc |= FG_ALPHA_FUNC_NEVER;
+               pp_misc |= R300_FG_ALPHA_FUNC_NEVER;
                break;
        case GL_LESS:
-               pp_misc |= FG_ALPHA_FUNC_LESS;
+               pp_misc |= R300_FG_ALPHA_FUNC_LESS;
                break;
        case GL_EQUAL:
-               pp_misc |= FG_ALPHA_FUNC_EQUAL;
+               pp_misc |= R300_FG_ALPHA_FUNC_EQUAL;
                break;
        case GL_LEQUAL:
-               pp_misc |= FG_ALPHA_FUNC_LE;
+               pp_misc |= R300_FG_ALPHA_FUNC_LE;
                break;
        case GL_GREATER:
-               pp_misc |= FG_ALPHA_FUNC_GREATER;
+               pp_misc |= R300_FG_ALPHA_FUNC_GREATER;
                break;
        case GL_NOTEQUAL:
-               pp_misc |= FG_ALPHA_FUNC_NOTEQUAL;
+               pp_misc |= R300_FG_ALPHA_FUNC_NOTEQUAL;
                break;
        case GL_GEQUAL:
-               pp_misc |= FG_ALPHA_FUNC_GE;
+               pp_misc |= R300_FG_ALPHA_FUNC_GE;
                break;
        case GL_ALWAYS:
                /*pp_misc |= FG_ALPHA_FUNC_ALWAYS; */
@@ -477,8 +477,9 @@ static void r300SetAlphaState(GLcontext * ctx)
        }
 
        if (really_enabled) {
-               pp_misc |= FG_ALPHA_FUNC_ENABLE;
-               pp_misc |= (refByte & R300_REF_ALPHA_MASK);
+               pp_misc |= R300_FG_ALPHA_FUNC_ENABLE;
+               pp_misc |= R500_FG_ALPHA_FUNC_8BIT;
+               pp_misc |= (refByte & R300_FG_ALPHA_FUNC_VAL_MASK);
        } else {
                pp_misc = 0x0;
        }
@@ -525,24 +526,15 @@ static void r300SetDepthState(GLcontext * ctx)
        r300ContextPtr r300 = R300_CONTEXT(ctx);
 
        R300_STATECHANGE(r300, zs);
-       r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_RB3D_STENCIL_ENABLE;
-       r300->hw.zs.cmd[R300_ZS_CNTL_1] &=
-           ~(R300_ZS_MASK << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT);
+       r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE|R300_STENCIL_FRONT_BACK;
+       r300->hw.zs.cmd[R300_ZS_CNTL_1] &= ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
 
-       if (ctx->Depth.Test && ctx->Depth.Func != GL_NEVER) {
+       if (ctx->Depth.Test) {
+               r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_ENABLE;
                if (ctx->Depth.Mask)
-                       r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
-                           R300_RB3D_Z_TEST_AND_WRITE;
-               else
-                       r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_RB3D_Z_TEST;
-
+                       r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_WRITE_ENABLE;
                r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
-                   translate_func(ctx->Depth.
-                                  Func) << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT;
-       } else {
-               r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_RB3D_Z_DISABLED_1;
-               r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
-                   translate_func(GL_NEVER) << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT;
+                   translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
        }
 
        r300SetEarlyZState(ctx);
@@ -556,10 +548,10 @@ static void r300SetStencilState(GLcontext * ctx, GLboolean state)
                R300_STATECHANGE(r300, zs);
                if (state) {
                        r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
-                           R300_RB3D_STENCIL_ENABLE;
+                           R300_STENCIL_ENABLE;
                } else {
                        r300->hw.zs.cmd[R300_ZS_CNTL_0] &=
-                           ~R300_RB3D_STENCIL_ENABLE;
+                           ~R300_STENCIL_ENABLE;
                }
        } else {
 #if R200_MERGED
@@ -571,7 +563,7 @@ static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 static void r300UpdatePolygonMode(GLcontext * ctx)
 {
        r300ContextPtr r300 = R300_CONTEXT(ctx);
-       uint32_t hw_mode = GA_POLY_MODE_DISABLE;
+       uint32_t hw_mode = R300_GA_POLY_MODE_DISABLE;
 
        /* Only do something if a polygon mode is wanted, default is GL_FILL */
        if (ctx->Polygon.FrontMode != GL_FILL ||
@@ -590,29 +582,29 @@ static void r300UpdatePolygonMode(GLcontext * ctx)
                }
 
                /* Enable polygon mode */
-               hw_mode |= GA_POLY_MODE_DUAL;
+               hw_mode |= R300_GA_POLY_MODE_DUAL;
 
                switch (f) {
                case GL_LINE:
-                       hw_mode |= GA_POLY_MODE_FRONT_PTYPE_LINE;
+                       hw_mode |= R300_GA_POLY_MODE_FRONT_PTYPE_LINE;
                        break;
                case GL_POINT:
-                       hw_mode |= GA_POLY_MODE_FRONT_PTYPE_POINT;
+                       hw_mode |= R300_GA_POLY_MODE_FRONT_PTYPE_POINT;
                        break;
                case GL_FILL:
-                       hw_mode |= GA_POLY_MODE_FRONT_PTYPE_TRI;
+                       hw_mode |= R300_GA_POLY_MODE_FRONT_PTYPE_TRI;
                        break;
                }
 
                switch (b) {
                case GL_LINE:
-                       hw_mode |= GA_POLY_MODE_BACK_PTYPE_LINE;
+                       hw_mode |= R300_GA_POLY_MODE_BACK_PTYPE_LINE;
                        break;
                case GL_POINT:
-                       hw_mode |= GA_POLY_MODE_BACK_PTYPE_POINT;
+                       hw_mode |= R300_GA_POLY_MODE_BACK_PTYPE_POINT;
                        break;
                case GL_FILL:
-                       hw_mode |= GA_POLY_MODE_BACK_PTYPE_TRI;
+                       hw_mode |= R300_GA_POLY_MODE_BACK_PTYPE_TRI;
                        break;
                }
        }
@@ -716,8 +708,8 @@ static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
                        R300_STATECHANGE(r300, fogs);
                        r300->hw.fogs.cmd[R300_FOGS_STATE] =
                            (r300->hw.fogs.
-                            cmd[R300_FOGS_STATE] & ~FG_FOG_BLEND_FN_MASK) |
-                           FG_FOG_BLEND_FN_LINEAR;
+                            cmd[R300_FOGS_STATE] & ~R300_FG_FOG_BLEND_FN_MASK) |
+                           R300_FG_FOG_BLEND_FN_LINEAR;
 
                        if (ctx->Fog.Start == ctx->Fog.End) {
                                fogScale.f = -1.0;
@@ -734,8 +726,8 @@ static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
                        R300_STATECHANGE(r300, fogs);
                        r300->hw.fogs.cmd[R300_FOGS_STATE] =
                            (r300->hw.fogs.
-                            cmd[R300_FOGS_STATE] & ~FG_FOG_BLEND_FN_MASK) |
-                           FG_FOG_BLEND_FN_EXP;
+                            cmd[R300_FOGS_STATE] & ~R300_FG_FOG_BLEND_FN_MASK) |
+                           R300_FG_FOG_BLEND_FN_EXP;
                        fogScale.f = 0.0933 * ctx->Fog.Density;
                        fogStart.f = 0.0;
                        break;
@@ -743,8 +735,8 @@ static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
                        R300_STATECHANGE(r300, fogs);
                        r300->hw.fogs.cmd[R300_FOGS_STATE] =
                            (r300->hw.fogs.
-                            cmd[R300_FOGS_STATE] & ~FG_FOG_BLEND_FN_MASK) |
-                           FG_FOG_BLEND_FN_EXP2;
+                            cmd[R300_FOGS_STATE] & ~R300_FG_FOG_BLEND_FN_MASK) |
+                           R300_FG_FOG_BLEND_FN_EXP2;
                        fogScale.f = 0.3 * ctx->Fog.Density;
                        fogStart.f = 0.0;
                default:
@@ -808,7 +800,7 @@ static void r300SetFogState(GLcontext * ctx, GLboolean state)
 
        R300_STATECHANGE(r300, fogs);
        if (state) {
-               r300->hw.fogs.cmd[R300_FOGS_STATE] |= FG_FOG_BLEND_ENABLE;
+               r300->hw.fogs.cmd[R300_FOGS_STATE] |= R300_FG_FOG_BLEND_ENABLE;
 
                r300Fogfv(ctx, GL_FOG_MODE, NULL);
                r300Fogfv(ctx, GL_FOG_DENSITY, &ctx->Fog.Density);
@@ -816,7 +808,7 @@ static void r300SetFogState(GLcontext * ctx, GLboolean state)
                r300Fogfv(ctx, GL_FOG_END, &ctx->Fog.End);
                r300Fogfv(ctx, GL_FOG_COLOR, ctx->Fog.Color);
        } else {
-               r300->hw.fogs.cmd[R300_FOGS_STATE] &= ~FG_FOG_BLEND_ENABLE;
+               r300->hw.fogs.cmd[R300_FOGS_STATE] &= ~R300_FG_FOG_BLEND_ENABLE;
        }
 }
 
@@ -835,6 +827,31 @@ static void r300PointSize(GLcontext * ctx, GLfloat size)
            ((int)(size * 6) << R300_POINTSIZE_Y_SHIFT);
 }
 
+static void r300PointParameter(GLcontext * ctx, GLenum pname, const GLfloat * param) /* Mirrors ctx->Point.MinSize / MaxSize into ga_point_minmax.cmd[1]; `param` is not read, values come from ctx. */
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       switch (pname) {
+       case GL_POINT_SIZE_MIN:
+               R300_STATECHANGE(r300, ga_point_minmax);
+               r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MIN_MASK; /* clear the old minimum before OR-ing in the new one */
+               r300->hw.ga_point_minmax.cmd[1] |= (GLuint)(ctx->Point.MinSize * 16.0); /* NOTE(review): not masked with R300_GA_POINT_MINMAX_MIN_MASK, and r300PointSize scales by 6 rather than 16 - confirm range and unit */
+               break;
+       case GL_POINT_SIZE_MAX:
+               R300_STATECHANGE(r300, ga_point_minmax);
+               r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MAX_MASK; /* clear the old maximum before OR-ing in the new one */
+               r300->hw.ga_point_minmax.cmd[1] |= (GLuint)(ctx->Point.MaxSize * 16.0) /* NOTE(review): shifted value is not masked with R300_GA_POINT_MINMAX_MAX_MASK - confirm MaxSize is bounded upstream */
+                       << R300_GA_POINT_MINMAX_MAX_SHIFT;
+               break;
+       case GL_POINT_DISTANCE_ATTENUATION: /* accepted, but no hardware state is written here */
+               break;
+       case GL_POINT_FADE_THRESHOLD_SIZE: /* accepted, but no hardware state is written here */
+               break;
+       default: /* any other pname is ignored */
+               break;
+       }
+}
+
 /* =============================================================
  * Line state
  */
@@ -914,36 +931,36 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        GLuint refmask =
            (((ctx->Stencil.
-              Ref[0] & 0xff) << ZB_STENCILREFMASK_STENCILREF_SHIFT) | ((ctx->
-                                                                     Stencil.
-                                                                     ValueMask
-                                                                     [0] &
-                                                                     0xff)
-                                                                    <<
-                                                                    ZB_STENCILREFMASK_STENCILMASK_SHIFT));
+              Ref[0] & 0xff) << R300_STENCILREF_SHIFT) | ((ctx->
+                                                           Stencil.
+                                                           ValueMask
+                                                           [0] &
+                                                           0xff)
+                                                          <<
+                                                          R300_STENCILMASK_SHIFT));
 
        GLuint flag;
 
        R300_STATECHANGE(rmesa, zs);
-
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &= ~((R300_ZS_MASK <<
-                                              R300_RB3D_ZS1_FRONT_FUNC_SHIFT)
+                                              R300_S_FRONT_FUNC_SHIFT)
                                              | (R300_ZS_MASK <<
-                                                R300_RB3D_ZS1_BACK_FUNC_SHIFT));
+                                                R300_S_BACK_FUNC_SHIFT));
 
        rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
-           ~((ZB_STENCILREFMASK_STENCIL_MASK << ZB_STENCILREFMASK_STENCILREF_SHIFT) |
-             (ZB_STENCILREFMASK_STENCIL_MASK << ZB_STENCILREFMASK_STENCILMASK_SHIFT));
+           ~((R300_STENCILREF_MASK << R300_STENCILREF_SHIFT) |
+             (R300_STENCILREF_MASK << R300_STENCILMASK_SHIFT));
 
        flag = translate_func(ctx->Stencil.Function[0]);
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
-           (flag << R300_RB3D_ZS1_FRONT_FUNC_SHIFT);
+           (flag << R300_S_FRONT_FUNC_SHIFT);
 
        if (ctx->Stencil._TestTwoSide)
                flag = translate_func(ctx->Stencil.Function[1]);
 
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
-           (flag << R300_RB3D_ZS1_BACK_FUNC_SHIFT);
+           (flag << R300_S_BACK_FUNC_SHIFT);
        rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |= refmask;
 }
 
@@ -953,11 +970,12 @@ static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 
        R300_STATECHANGE(rmesa, zs);
        rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
-           ~(ZB_STENCILREFMASK_STENCIL_MASK <<
-             ZB_STENCILREFMASK_STENCILWRITEMASK_SHIFT);
+           ~(R300_STENCILREF_MASK <<
+             R300_STENCILWRITEMASK_SHIFT);
        rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |=
            (ctx->Stencil.
-            WriteMask[0] & 0xff) << ZB_STENCILREFMASK_STENCILWRITEMASK_SHIFT;
+            WriteMask[0] & R300_STENCILREF_MASK) <<
+            R300_STENCILWRITEMASK_SHIFT;
 }
 
 static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
@@ -968,49 +986,37 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
        R300_STATECHANGE(rmesa, zs);
        /* It is easier to mask what's left.. */
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
-           (R300_ZS_MASK << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT) |
-           (R300_ZS_MASK << R300_RB3D_ZS1_FRONT_FUNC_SHIFT) |
-           (R300_ZS_MASK << R300_RB3D_ZS1_BACK_FUNC_SHIFT);
+           (R300_ZS_MASK << R300_Z_FUNC_SHIFT) |
+           (R300_ZS_MASK << R300_S_FRONT_FUNC_SHIFT) |
+           (R300_ZS_MASK << R300_S_BACK_FUNC_SHIFT);
 
        rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
            (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
-            R300_RB3D_ZS1_FRONT_FAIL_OP_SHIFT)
+            R300_S_FRONT_SFAIL_OP_SHIFT)
            | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
-              R300_RB3D_ZS1_FRONT_ZFAIL_OP_SHIFT)
+              R300_S_FRONT_ZFAIL_OP_SHIFT)
            | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
-              R300_RB3D_ZS1_FRONT_ZPASS_OP_SHIFT);
+              R300_S_FRONT_ZPASS_OP_SHIFT);
 
        if (ctx->Stencil._TestTwoSide) {
                rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
                    (translate_stencil_op(ctx->Stencil.FailFunc[1]) <<
-                    R300_RB3D_ZS1_BACK_FAIL_OP_SHIFT)
+                    R300_S_BACK_SFAIL_OP_SHIFT)
                    | (translate_stencil_op(ctx->Stencil.ZFailFunc[1]) <<
-                      R300_RB3D_ZS1_BACK_ZFAIL_OP_SHIFT)
+                      R300_S_BACK_ZFAIL_OP_SHIFT)
                    | (translate_stencil_op(ctx->Stencil.ZPassFunc[1]) <<
-                      R300_RB3D_ZS1_BACK_ZPASS_OP_SHIFT);
+                      R300_S_BACK_ZPASS_OP_SHIFT);
        } else {
                rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
                    (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
-                    R300_RB3D_ZS1_BACK_FAIL_OP_SHIFT)
+                    R300_S_BACK_SFAIL_OP_SHIFT)
                    | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
-                      R300_RB3D_ZS1_BACK_ZFAIL_OP_SHIFT)
+                      R300_S_BACK_ZFAIL_OP_SHIFT)
                    | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
-                      R300_RB3D_ZS1_BACK_ZPASS_OP_SHIFT);
+                      R300_S_BACK_ZPASS_OP_SHIFT);
        }
 }
 
-static void r300ClearStencil(GLcontext * ctx, GLint s)
-{
-       r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-       rmesa->state.stencil.clear =
-           ((GLuint) (ctx->Stencil.Clear & 0xff) |
-            (ZB_STENCILREFMASK_STENCIL_MASK <<
-             ZB_STENCILREFMASK_STENCILMASK_SHIFT) | ((ctx->Stencil.
-                                                   WriteMask[0] & 0xff) <<
-                                                  ZB_STENCILREFMASK_STENCILMASK_SHIFT));
-}
-
 /* =============================================================
  * Window position and viewport transformation
  */
@@ -1275,8 +1281,8 @@ static unsigned long gen_fixed_filter(unsigned long f)
            (R300_TX_CLAMP << R300_TX_WRAP_T_SHIFT)) {
                needs_fixing |= 2;
        }
-       if ((f & ((7 - 1) << R300_TX_WRAP_Q_SHIFT)) ==
-           (R300_TX_CLAMP << R300_TX_WRAP_Q_SHIFT)) {
+       if ((f & ((7 - 1) << R300_TX_WRAP_R_SHIFT)) ==
+           (R300_TX_CLAMP << R300_TX_WRAP_R_SHIFT)) {
                needs_fixing |= 4;
        }
 
@@ -1284,7 +1290,7 @@ static unsigned long gen_fixed_filter(unsigned long f)
                return f;
 
        mag = f & R300_TX_MAG_FILTER_MASK;
-       min = f & R300_TX_MIN_FILTER_MASK;
+       min = f & (R300_TX_MIN_FILTER_MASK|R300_TX_MIN_FILTER_MIP_MASK);
 
        /* TODO: Check for anisto filters too */
        if ((mag != R300_TX_MAG_FILTER_NEAREST)
@@ -1316,12 +1322,100 @@ static unsigned long gen_fixed_filter(unsigned long f)
                f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_T_SHIFT;
        }
        if (needs_fixing & 4) {
-               f &= ~((7 - 1) << R300_TX_WRAP_Q_SHIFT);
-               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_Q_SHIFT;
+               f &= ~((7 - 1) << R300_TX_WRAP_R_SHIFT);
+               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_R_SHIFT;
        }
        return f;
 }
 
+static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings) /* Copies the current fragment program's texture instructions into the fpt command buffer, substituting each texture id with tmu_mappings[unit] when that entry is >= 0. */
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       int i;
+       struct r300_fragment_program *fp = (struct r300_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+       struct r300_fragment_program_code *code = &fp->code; /* NOTE(review): fp is not NULL-checked here - presumably the caller guarantees a current program; confirm */
+
+       R300_STATECHANGE(r300, fpt);
+
+       for (i = 0; i < code->tex.length; i++) {
+               int unit;
+               int opcode;
+               unsigned long val;
+
+               unit = code->tex.inst[i] >> R300_TEX_ID_SHIFT;
+               unit &= 15; /* keep the low 4 bits as the index into tmu_mappings */
+
+               val = code->tex.inst[i];
+               val &= ~R300_TEX_ID_MASK; /* strip the original texture id so a remapped one can be OR-ed in */
+
+               opcode =
+                       (val & R300_TEX_INST_MASK) >> R300_TEX_INST_SHIFT;
+               if (opcode == R300_TEX_OP_KIL) { /* KIL is emitted with the id field left cleared */
+                       r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
+               } else {
+                       if (tmu_mappings[unit] >= 0) {
+                               val |=
+                                       tmu_mappings[unit] <<
+                                       R300_TEX_ID_SHIFT;
+                               r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
+                       } else {
+                               // We get here when the corresponding texture image is incomplete
+                               // (e.g. incomplete mipmaps etc.)
+                               r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val; /* emitted with the id field cleared */
+                       }
+               }
+       }
+
+       r300->hw.fpt.cmd[R300_FPT_CMD_0] =
+               cmdpacket0(R300_US_TEX_INST_0, code->tex.length); /* packet header sized to the number of texture instructions written above */
+}
+
+static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings) /* Rewrites, in place, the texture unit stored in inst1 of every TEX-type instruction of the current fragment program using tmu_mappings. */
+{
+       int i;
+       struct r500_fragment_program *fp = (struct r500_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+       struct r500_fragment_program_code *code = &fp->code; /* NOTE(review): fp is not NULL-checked here - presumably the caller guarantees a current program; confirm */
+
+       /* find all the texture instructions and relocate the texture units */
+       for (i = 0; i < code->inst_end + 1; i++) { /* inst_end is treated as an inclusive last index */
+               if ((code->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
+                       uint32_t val;
+                       int unit, opcode, new_unit;
+
+                       val = code->inst[i].inst1;
+
+                       unit = (val >> 16) & 0xf; /* current unit is read from bits 16..19 of inst1 */
+
+                       val &= ~(0xf << 16); /* clear those bits before inserting the new unit */
+
+                       opcode = val & (0x7 << 22); /* compared unshifted against R500_TEX_INST_TEXKILL */
+                       if (opcode == R500_TEX_INST_TEXKILL) {
+                               new_unit = 0;
+                       } else {
+                               if (tmu_mappings[unit] >= 0) {
+                                       new_unit = tmu_mappings[unit];
+                               } else {
+                                       new_unit = 0; /* negative mapping falls back to unit 0 */
+                               }
+                       }
+                       val |= R500_TEX_ID(new_unit); /* presumably R500_TEX_ID targets the same bits 16..19 - confirm against the register header */
+                       code->inst[i].inst1 = val; /* NOTE(review): stored back into the program code, so a later call reads the already-remapped unit - confirm this is intended */
+               }
+       }
+}
+
+static GLuint translate_lod_bias(GLfloat bias) /* Converts a floating-point LOD bias into the value OR-ed into the filter_1 texture register. */
+{
+       GLint b = (int)(bias*32); /* scale by 32 and truncate to an integer */
+       if (b >= (1 << 9)) /* clamp to the range [-512, 511] */
+               b = (1 << 9)-1;
+       else if (b < -(1 << 9))
+               b = -(1 << 9);
+       return (((GLuint)b) << R300_LOD_BIAS_SHIFT) & R300_LOD_BIAS_MASK; /* the mask discards the sign-extension bits of a negative b */
+}
+
 static void r300SetupTextures(GLcontext * ctx)
 {
        int i, mtu;
@@ -1385,8 +1479,14 @@ static void r300SetupTextures(GLcontext * ctx)
                        r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
                                                hw_tmu] =
                            gen_fixed_filter(t->filter) | (hw_tmu << 28);
-                       /* Currently disabled! */
-                       r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = 0x0;     //0x20501f80;
+                       /* Note: There is a LOD bias per texture unit and a LOD bias
+                        * per texture object. We add them here to get the correct behaviour.
+                        * (The per-texture object LOD bias was introduced in OpenGL 1.4
+                        * and is not present in the EXT_texture_object extension).
+                        */
+                       r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+                               t->filter_1 |
+                               translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias);
                        r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
                            t->size;
                        r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
@@ -1436,39 +1536,18 @@ static void r300SetupTextures(GLcontext * ctx)
        if (!fp)                /* should only happenen once, just after context is created */
                return;
 
-       R300_STATECHANGE(r300, fpt);
-
-       for (i = 0; i < fp->tex.length; i++) {
-               int unit;
-               int opcode;
-               unsigned long val;
-
-               unit = fp->tex.inst[i] >> R300_FPITX_IMAGE_SHIFT;
-               unit &= 15;
-
-               val = fp->tex.inst[i];
-               val &= ~R300_FPITX_IMAGE_MASK;
-
-               opcode =
-                   (val & R300_FPITX_OPCODE_MASK) >> R300_FPITX_OPCODE_SHIFT;
-               if (opcode == R300_FPITX_OP_KIL) {
-                       r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
-               } else {
-                       if (tmu_mappings[unit] >= 0) {
-                               val |=
-                                   tmu_mappings[unit] <<
-                                   R300_FPITX_IMAGE_SHIFT;
-                               r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
-                       } else {
-                               // We get here when the corresponding texture image is incomplete
-                               // (e.g. incomplete mipmaps etc.)
-                               r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
-                       }
+       if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+               if (fp->mesa_program.UsesKill && last_hw_tmu < 0) {
+                       // The KILL operation requires the first texture unit
+                       // to be enabled.
+                       r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
+                       r300->hw.tex.filter.cmd[R300_TEX_VALUE_0] = 0;
+                       r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+                               cmdpacket0(R300_TX_FILTER0_0, 1);
                }
-       }
-
-       r300->hw.fpt.cmd[R300_FPT_CMD_0] =
-           cmdpacket0(R300_PFS_TEXI_0, fp->tex.length);
+               r300SetupFragmentShaderTextures(ctx, tmu_mappings);
+       } else
+               r500SetupFragmentShaderTextures(ctx, tmu_mappings);
 
        if (RADEON_DEBUG & DEBUG_STATE)
                fprintf(stderr, "TX_ENABLE: %08x  last_hw_tmu=%d\n",
@@ -1488,21 +1567,17 @@ static void r300SetupRSUnit(GLcontext * ctx)
 {
        r300ContextPtr r300 = R300_CONTEXT(ctx);
        /* I'm still unsure if these are needed */
-       GLuint interp_magic[8] = {
-               0x00,
-               R300_RS_COL_PTR(1),
-               R300_RS_COL_PTR(2),
-               R300_RS_COL_PTR(3),
-               0x00,
-               0x00,
-               0x00,
-               0x00
-       };
+       GLuint interp_col[8];
+        TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *VB = &tnl->vb;
        union r300_outputs_written OutputsWritten;
        GLuint InputsRead;
        int fp_reg, high_rr;
-       int in_texcoords, col_interp_nr;
-       int i;
+       int col_interp_nr;
+       int rs_tex_count = 0, rs_col_count = 0;
+       int i, count;
+
+       memset(interp_col, 0, sizeof(interp_col));
 
        if (hw_tcl_on)
                OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
@@ -1520,9 +1595,9 @@ static void r300SetupRSUnit(GLcontext * ctx)
        R300_STATECHANGE(r300, rc);
        R300_STATECHANGE(r300, rr);
 
-       fp_reg = in_texcoords = col_interp_nr = high_rr = 0;
+       fp_reg = col_interp_nr = high_rr = 0;
 
-       r300->hw.rr.cmd[R300_RR_ROUTE_1] = 0;
+       r300->hw.rr.cmd[R300_RR_INST_1] = 0;
 
        if (InputsRead & FRAG_BIT_WPOS) {
                for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
@@ -1538,15 +1613,53 @@ static void r300SetupRSUnit(GLcontext * ctx)
                InputsRead &= ~FRAG_BIT_WPOS;
        }
 
+       if (InputsRead & FRAG_BIT_COL0) {
+               count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
+               interp_col[0] |= R300_RS_COL_PTR(rs_col_count);
+               if (count == 3)
+                       interp_col[0] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGB1);
+               rs_col_count += count;
+       }
+       else
+               interp_col[0] = R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+
+       if (InputsRead & FRAG_BIT_COL1) {
+               count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
+               if (count == 3)
+                       interp_col[1] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGB0);
+               interp_col[1] |= R300_RS_COL_PTR(1);
+               rs_col_count += count;
+       }
+
+
        for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0 | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) | (in_texcoords << R300_RS_INTERP_SRC_SHIFT)
-                   | interp_magic[i];
+               int swiz;
+
+               /* with TCL we always seem to route 4 components */
+               if (hw_tcl_on)
+                 count = 4;
+               else
+                 count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
+
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = interp_col[i] | rs_tex_count;
+               switch(count) {
+               case 4: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3); break;
+               case 3: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
+               default:
+               case 1:
+               case 2: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
+               };
 
-               r300->hw.rr.cmd[R300_RR_ROUTE_0 + fp_reg] = 0;
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] |= swiz;
+
+               r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] = 0;
                if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+
+                       rs_tex_count += count;
+
                        //assert(r300->state.texture.tc_count != 0);
-                       r300->hw.rr.cmd[R300_RR_ROUTE_0 + fp_reg] |= R300_RS_ROUTE_ENABLE | i   /* source INTERP */
-                           | (fp_reg << R300_RS_ROUTE_DEST_SHIFT);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] |= R300_RS_INST_TEX_CN_WRITE | i       /* source INTERP */
+                           | (fp_reg << R300_RS_INST_TEX_ADDR_SHIFT);
                        high_rr = fp_reg;
 
                        /* Passing invalid data here can lock the GPU. */
@@ -1557,15 +1670,171 @@ static void r300SetupRSUnit(GLcontext * ctx)
                                WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
                        }
                }
-               /* Need to count all coords enabled at vof */
-               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
-                       in_texcoords++;
+       }
+
+       if (InputsRead & FRAG_BIT_COL0) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+                       r300->hw.rr.cmd[R300_RR_INST_0] |= R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
+                       InputsRead &= ~FRAG_BIT_COL0;
+                       col_interp_nr++;
+               } else {
+                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+               }
+       }
+
+       if (InputsRead & FRAG_BIT_COL1) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+                       r300->hw.rr.cmd[R300_RR_INST_1] |= R300_RS_INST_COL_ID(1) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
+                       InputsRead &= ~FRAG_BIT_COL1;
+                       if (high_rr < 1)
+                               high_rr = 1;
+                       col_interp_nr++;
+               } else {
+                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+               }
+       }
+
+       /* Need at least one. This might still lock as the values are undefined... */
+       if (rs_tex_count == 0 && col_interp_nr == 0) {
+               r300->hw.rr.cmd[R300_RR_INST_0] |= R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
+               col_interp_nr++;
+       }
+
+       r300->hw.rc.cmd[1] = 0 | (rs_tex_count << R300_IT_COUNT_SHIFT)
+         | (col_interp_nr << R300_IC_COUNT_SHIFT)
+         | R300_HIRES_EN;
+
+       assert(high_rr >= 0);
+       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr + 1);
+       r300->hw.rc.cmd[2] = high_rr;
+
+       if (InputsRead)
+               WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
+}
+
+static void r500SetupRSUnit(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       /* I'm still unsure if these are needed */
+       GLuint interp_col[8];
+       union r300_outputs_written OutputsWritten;
+        TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *VB = &tnl->vb;
+       GLuint InputsRead;
+       int fp_reg, high_rr;
+       int rs_col_count = 0;
+       int in_texcoords, col_interp_nr;
+       int i, count;
+
+       memset(interp_col, 0, sizeof(interp_col));
+       if (hw_tcl_on)
+               OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+       else
+               RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+
+       if (ctx->FragmentProgram._Current)
+               InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
+       else {
+               fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+               return;         /* This should only ever happen once.. */
+       }
+
+       R300_STATECHANGE(r300, ri);
+       R300_STATECHANGE(r300, rc);
+       R300_STATECHANGE(r300, rr);
+
+       fp_reg = col_interp_nr = high_rr = in_texcoords = 0;
+
+       r300->hw.rr.cmd[R300_RR_INST_1] = 0;
+
+       if (InputsRead & FRAG_BIT_WPOS) {
+               for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+                       if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
+                               break;
+
+               if (i == ctx->Const.MaxTextureUnits) {
+                       fprintf(stderr, "\tno free texcoord found...\n");
+                       _mesa_exit(-1);
+               }
+
+               InputsRead |= (FRAG_BIT_TEX0 << i);
+               InputsRead &= ~FRAG_BIT_WPOS;
+       }
+
+       if (InputsRead & FRAG_BIT_COL0) {
+               count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
+               interp_col[0] |= R500_RS_COL_PTR(rs_col_count);
+               if (count == 3)
+                       interp_col[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGB1);
+               rs_col_count += count;
+       }
+       else
+               interp_col[0] = R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+
+       if (InputsRead & FRAG_BIT_COL1) {
+               count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
+               interp_col[1] |= R500_RS_COL_PTR(1);
+               if (count == 3)
+                       interp_col[1] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGB0);
+               rs_col_count += count;
+       }
+
+       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+               GLuint swiz = 0;
+
+               /* with TCL we always seem to route 4 components */
+               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+
+                 if (hw_tcl_on)
+                   count = 4;
+                 else
+                   count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
+
+                 /* always have one texcoord */
+                 swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_S_SHIFT;
+                 if (count >= 2)
+                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_T_SHIFT;
+                 else
+                   swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
+
+                 if (count >= 3)
+                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_R_SHIFT;
+                 else
+                   swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+
+                 if (count == 4)
+                   swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_Q_SHIFT;
+                 else
+                   swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+
+               } else
+                  swiz = (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                         (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                         (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                         (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = interp_col[i] | swiz;
+
+               r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] = 0;
+               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+                       //assert(r300->state.texture.tc_count != 0);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] |= R500_RS_INST_TEX_CN_WRITE | i       /* source INTERP */
+                           | (fp_reg << R500_RS_INST_TEX_ADDR_SHIFT);
+                       high_rr = fp_reg;
+
+                       /* Passing invalid data here can lock the GPU. */
+                       if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+                               InputsRead &= ~(FRAG_BIT_TEX0 << i);
+                               fp_reg++;
+                       } else {
+                               WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+                       }
                }
        }
 
        if (InputsRead & FRAG_BIT_COL0) {
                if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-                       r300->hw.rr.cmd[R300_RR_ROUTE_0] |= 0 | R300_RS_ROUTE_0_COLOR | (fp_reg++ << R300_RS_ROUTE_0_COLOR_DEST_SHIFT);
+                       r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_CN_WRITE | (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
                        InputsRead &= ~FRAG_BIT_COL0;
                        col_interp_nr++;
                } else {
@@ -1575,7 +1844,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
        if (InputsRead & FRAG_BIT_COL1) {
                if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-                       r300->hw.rr.cmd[R300_RR_ROUTE_1] |= R300_RS_ROUTE_1_UNKNOWN11 | R300_RS_ROUTE_1_COLOR1 | (fp_reg++ << R300_RS_ROUTE_1_COLOR1_DEST_SHIFT);
+                       r300->hw.rr.cmd[R300_RR_INST_1] |= (1 << 12) | R500_RS_INST_COL_CN_WRITE |  (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
                        InputsRead &= ~FRAG_BIT_COL1;
                        if (high_rr < 1)
                                high_rr = 1;
@@ -1587,22 +1856,25 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
        /* Need at least one. This might still lock as the values are undefined... */
        if (in_texcoords == 0 && col_interp_nr == 0) {
-               r300->hw.rr.cmd[R300_RR_ROUTE_0] |= 0 | R300_RS_ROUTE_0_COLOR | (fp_reg++ << R300_RS_ROUTE_0_COLOR_DEST_SHIFT);
+               r300->hw.rr.cmd[R300_RR_INST_0] |= 0 | R500_RS_INST_COL_CN_WRITE | (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
                col_interp_nr++;
        }
 
-       r300->hw.rc.cmd[1] = 0 | ((in_texcoords << 2) << R300_IT_COUNT_SHIFT)
+       r300->hw.rc.cmd[1] = 0 | (in_texcoords << R300_IT_COUNT_SHIFT)
          | (col_interp_nr << R300_IC_COUNT_SHIFT)
          | R300_HIRES_EN;
 
        assert(high_rr >= 0);
-       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_ROUTE_0, high_rr + 1);
+       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr + 1);
        r300->hw.rc.cmd[2] = 0xC0 | high_rr;
 
        if (InputsRead)
                WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
 }
 
+
+
+
 #define bump_vpu_count(ptr, new_count)   do{\
        drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
        int _nc=(new_count)/4; \
@@ -1610,7 +1882,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
        if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
        }while(0)
 
-static inline void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
+static INLINE void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
 {
        int i;
 
@@ -1648,10 +1920,67 @@ static inline void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest,
        }
 }
 
+#define MIN3(a, b, c)  ((a) < (b) ? MIN2(a, c) : MIN2(b, c)) /* three-way selection built on MIN2 (presumably the smaller of two - confirm); arguments are expanded more than once, so pass side-effect-free expressions */
+
+/* Fill in the vap_cntl state word: PVS slot/controller counts derived from the given counts, plus a per-chip-family FPU count. */
+static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
+                       GLuint output_count, GLuint temp_count)
+{
+    int vtx_mem_size;
+    int pvs_num_slots;
+    int pvs_num_cntrls;
+
+    /* Flush PVS engine before changing PVS_NUM_SLOTS, PVS_NUM_CNTRLS.
+     * See r500 docs 6.5.2 - done in emit */
+
+    /* avoid division by zero: all three counts are used as divisors below */
+    if (input_count == 0) input_count = 1;
+    if (output_count == 0) output_count = 1;
+    if (temp_count == 0) temp_count = 1;
+
+    if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+       vtx_mem_size = 128; /* chip families from RV515 upwards */
+    else
+       vtx_mem_size = 72; /* everything below RV515 */
+
+    pvs_num_slots = MIN3(10, vtx_mem_size/input_count, vtx_mem_size/output_count); /* at most 10, bounded by memory per input and per output */
+    pvs_num_cntrls = MIN2(6, vtx_mem_size/temp_count); /* at most 6, bounded by memory per temporary */
+
+    R300_STATECHANGE(rmesa, vap_cntl);
+    if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] =
+           (pvs_num_slots << R300_PVS_NUM_SLOTS_SHIFT) |
+           (pvs_num_cntrls << R300_PVS_NUM_CNTLRS_SHIFT) |
+           (12 << R300_VF_MAX_VTX_NUM_SHIFT);
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+           rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= R500_TCL_STATE_OPTIMIZATION;
+    } else
+       /* not sure about non-tcl: fixed values are used instead of the computed slot/controller counts */
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+                                   (5 << R300_PVS_NUM_CNTLRS_SHIFT) |
+                                   (5 << R300_VF_MAX_VTX_NUM_SHIFT));
+
+    if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515) /* the FPU count field is OR-ed in per chip family */
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (2 << R300_PVS_NUM_FPUS_SHIFT);
+    else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560))
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (5 << R300_PVS_NUM_FPUS_SHIFT);
+    else if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420)
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (6 << R300_PVS_NUM_FPUS_SHIFT);
+    else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (8 << R300_PVS_NUM_FPUS_SHIFT);
+    else
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (4 << R300_PVS_NUM_FPUS_SHIFT); /* fallback for every family not listed above */
+
+}
+
 static void r300SetupDefaultVertexProgram(r300ContextPtr rmesa)
 {
        struct r300_vertex_shader_state *prog = &(rmesa->state.vertex_shader);
        GLuint o_reg = 0;
+       GLuint i_reg = 0;
        int i;
        int inst_count = 0;
        int param_count = 0;
@@ -1659,31 +1988,42 @@ static void r300SetupDefaultVertexProgram(r300ContextPtr rmesa)
 
        for (i = VERT_ATTRIB_POS; i < VERT_ATTRIB_MAX; i++) {
                if (rmesa->state.sw_tcl_inputs[i] != -1) {
-                       prog->program.body.i[program_end + 0] = EASY_VSF_OP(MUL, o_reg++, ALL, RESULT);
-                       prog->program.body.i[program_end + 1] = VSF_REG(rmesa->state.sw_tcl_inputs[i]);
-                       prog->program.body.i[program_end + 2] = VSF_ATTR_UNITY(rmesa->state.sw_tcl_inputs[i]);
-                       prog->program.body.i[program_end + 3] = VSF_UNITY(rmesa->state.sw_tcl_inputs[i]);
+                       prog->program.body.i[program_end + 0] = PVS_OP_DST_OPERAND(VE_MULTIPLY, GL_FALSE, GL_FALSE, o_reg++, VSF_FLAG_ALL, PVS_DST_REG_OUT);
+                       prog->program.body.i[program_end + 1] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+                       prog->program.body.i[program_end + 2] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+                       prog->program.body.i[program_end + 3] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
                        program_end += 4;
+                       i_reg++;
                }
        }
 
        prog->program.length = program_end;
 
-       r300SetupVertexProgramFragment(rmesa, R300_PVS_UPLOAD_PROGRAM,
+       r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START,
                                       &(prog->program));
        inst_count = (prog->program.length / 4) - 1;
 
+       r300VapCntl(rmesa, i_reg, o_reg, 0);
+
        R300_STATECHANGE(rmesa, pvs);
        rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-           (0 << R300_PVS_CNTL_1_PROGRAM_START_SHIFT) |
-           (inst_count << R300_PVS_CNTL_1_POS_END_SHIFT) |
-           (inst_count << R300_PVS_CNTL_1_PROGRAM_END_SHIFT);
+           (0 << R300_PVS_FIRST_INST_SHIFT) |
+           (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
+           (inst_count << R300_PVS_LAST_INST_SHIFT);
        rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-           (0 << R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT) |
-           (param_count << R300_PVS_CNTL_2_PARAM_COUNT_SHIFT);
+           (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+           (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
        rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-           (inst_count << R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT) |
-           (inst_count << R300_PVS_CNTL_3_PROGRAM_UNKNOWN2_SHIFT);
+           (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+}
+/* Count the set bits of x (treated as 32 bits) by summing adjacent bit fields of growing width. */
+static int bit_count (int x)
+{
+    x = ((x & 0xaaaaaaaaU) >> 1) + (x & 0x55555555U); /* each 2-bit field now holds its own bit count */
+    x = ((x & 0xccccccccU) >> 2) + (x & 0x33333333U); /* each nibble holds its bit count (0..4) */
+    x = (x >> 16) + (x & 0xffff); /* fold the upper half onto the lower: four nibble sums (0..8) */
+    x = ((x & 0xf0f0) >> 4) + (x & 0x0f0f); /* two byte sums (0..16) */
+    return (x >> 8) + (x & 0x00ff); /* add the two bytes: total 0..32 */
 }
 
 static void r300SetupRealVertexProgram(r300ContextPtr rmesa)
@@ -1704,20 +2044,22 @@ static void r300SetupRealVertexProgram(r300ContextPtr rmesa)
        bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
        param_count /= 4;
 
-       r300SetupVertexProgramFragment(rmesa, R300_PVS_UPLOAD_PROGRAM, &(prog->program));
+       r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START, &(prog->program));
        inst_count = (prog->program.length / 4) - 1;
 
+       r300VapCntl(rmesa, bit_count(prog->key.InputsRead),
+                   bit_count(prog->key.OutputsWritten), prog->num_temporaries);
+
        R300_STATECHANGE(rmesa, pvs);
        rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-         (0 << R300_PVS_CNTL_1_PROGRAM_START_SHIFT) |
-         (inst_count << R300_PVS_CNTL_1_POS_END_SHIFT) |
-         (inst_count << R300_PVS_CNTL_1_PROGRAM_END_SHIFT);
+         (0 << R300_PVS_FIRST_INST_SHIFT) |
+         (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
+         (inst_count << R300_PVS_LAST_INST_SHIFT);
        rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-         (0 << R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT) |
-         (param_count << R300_PVS_CNTL_2_PARAM_COUNT_SHIFT);
+         (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+         (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
        rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-         (inst_count << R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT) |
-         (inst_count << R300_PVS_CNTL_3_PROGRAM_UNKNOWN2_SHIFT);
+         (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
 }
 
 static void r300SetupVertexProgram(r300ContextPtr rmesa)
@@ -1740,13 +2082,6 @@ static void r300SetupVertexProgram(r300ContextPtr rmesa)
                r300SetupDefaultVertexProgram(rmesa);
        }
 
-
-       /* FIXME: This is done for vertex shader fragments, but also needs to be
-        * done for vap_pvs, so I leave it as a reminder. */
-#if 0
-       reg_start(R300_VAP_PVS_WAITIDLE, 0);
-       e32(0x00000000);
-#endif
 }
 
 /**
@@ -1848,11 +2183,6 @@ static void r300ResetHwState(r300ContextPtr r300)
        r300AlphaFunc(ctx, ctx->Color.AlphaFunc, ctx->Color.AlphaRef);
        r300Enable(ctx, GL_ALPHA_TEST, ctx->Color.AlphaEnabled);
 
-       if (!has_tcl)
-               r300->hw.vap_cntl.cmd[1] = 0x0014045a;
-       else
-               r300->hw.vap_cntl.cmd[1] = 0x0030045A;  //0x0030065a /* Dangerous */
-
        r300->hw.vte.cmd[1] = R300_VPORT_X_SCALE_ENA
            | R300_VPORT_X_OFFSET_ENA
            | R300_VPORT_Y_SCALE_ENA
@@ -1878,7 +2208,7 @@ static void r300ResetHwState(r300ContextPtr r300)
 
        /* XXX: Other families? */
        if (has_tcl) {
-               r300->hw.vap_clip_cntl.cmd[1] = R300_221C_NORMAL;
+               r300->hw.vap_clip_cntl.cmd[1] = R300_PS_UCP_MODE_DIST_COP;
 
                r300->hw.vap_clip.cmd[1] = r300PackFloat32(1.0); /* X */
                r300->hw.vap_clip.cmd[2] = r300PackFloat32(1.0); /* X */
@@ -1902,23 +2232,25 @@ static void r300ResetHwState(r300ContextPtr r300)
        r300->hw.gb_misc.cmd[R300_GB_MISC_MSPOS_0] = 0x66666666;
        r300->hw.gb_misc.cmd[R300_GB_MISC_MSPOS_1] = 0x06666666;
 
-       /* XXX: Other families? */
        r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
-           R300_GB_TILE_ENABLE | R300_GB_TILE_SIZE_16;
-       switch (r300->radeon.radeonScreen->chip_family) {
-       case CHIP_FAMILY_R300:
-       case CHIP_FAMILY_R350:
-       case CHIP_FAMILY_RV410:
+           R300_GB_TILE_ENABLE | R300_GB_TILE_SIZE_16 /*| R300_GB_SUBPIXEL_1_16*/;
+       switch (r300->radeon.radeonScreen->num_gb_pipes) {
+       case 1:
+       default:
+               r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] |=
+                   R300_GB_TILE_PIPE_COUNT_RV300;
+               break;
+       case 2:
                r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] |=
                    R300_GB_TILE_PIPE_COUNT_R300;
                break;
-       case CHIP_FAMILY_R420:
+       case 3:
                r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] |=
-                   R300_GB_TILE_PIPE_COUNT_R420;
+                   R300_GB_TILE_PIPE_COUNT_R420_3P;
                break;
-       default:
+       case 4:
                r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] |=
-                   R300_GB_TILE_DISABLE; /* TODO: This disables tiling totally. I guess it happened accidentially. */
+                   R300_GB_TILE_PIPE_COUNT_R420;
                break;
        }
 
@@ -1967,11 +2299,15 @@ static void r300ResetHwState(r300ContextPtr r300)
 
        r300->hw.sc_screendoor.cmd[1] = 0x00FFFFFF;
 
-       r300->hw.us_out_fmt.cmd[1] = 0x00001B01;
-       r300->hw.us_out_fmt.cmd[2] = 0x00001B0F;
-       r300->hw.us_out_fmt.cmd[3] = 0x00001B0F;
-       r300->hw.us_out_fmt.cmd[4] = 0x00001B0F;
-       r300->hw.us_out_fmt.cmd[5] = 0x00000001;
+       r300->hw.us_out_fmt.cmd[1] = R500_OUT_FMT_C4_8  |
+         R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
+       r300->hw.us_out_fmt.cmd[2] = R500_OUT_FMT_UNUSED |
+         R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
+       r300->hw.us_out_fmt.cmd[3] = R500_OUT_FMT_UNUSED |
+         R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
+       r300->hw.us_out_fmt.cmd[4] = R500_OUT_FMT_UNUSED |
+         R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
+       r300->hw.us_out_fmt.cmd[5] = R300_W_FMT_W24;
 
        r300Enable(ctx, GL_FOG, ctx->Fog.Enabled);
        r300Fogfv(ctx, GL_FOG_MODE, NULL);
@@ -2023,15 +2359,32 @@ static void r300ResetHwState(r300ContextPtr r300)
 
        if (r300->radeon.sarea->tiling_enabled) {
                /* XXX: Turn off when clearing buffers ? */
-               r300->hw.zb.cmd[R300_ZB_PITCH] |= ZB_DEPTHPITCH_DEPTHMACROTILE_ENABLE;
+               r300->hw.zb.cmd[R300_ZB_PITCH] |= R300_DEPTHMACROTILE_ENABLE;
 
                if (ctx->Visual.depthBits == 24)
                        r300->hw.zb.cmd[R300_ZB_PITCH] |=
-                           ZB_DEPTHPITCH_DEPTHMICROTILE_TILED;
+                           R300_DEPTHMICROTILE_TILED;
        }
 
        r300->hw.zb_depthclearvalue.cmd[1] = 0;
 
+       switch (ctx->Visual.depthBits) {
+       case 16:
+               r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_16BIT_INT_Z;
+               break;
+       case 24:
+               r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+               break;
+       default:
+               fprintf(stderr, "Error: Unsupported depth %d... exiting\n", ctx->Visual.depthBits);
+               _mesa_exit(-1);
+       }
+
+       r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
+       r300->hw.zstencil_format.cmd[3] = 0x00000003;
+       r300->hw.zstencil_format.cmd[4] = 0x00000000;
+       r300SetEarlyZState(ctx);
+
        r300->hw.unk4F30.cmd[1] = 0;
        r300->hw.unk4F30.cmd[2] = 0;
 
@@ -2039,6 +2392,7 @@ static void r300ResetHwState(r300ContextPtr r300)
 
        r300->hw.zb_hiz_pitch.cmd[1] = 0;
 
+       r300VapCntl(r300, 0, 0, 0);
        if (has_tcl) {
                r300->hw.vps.cmd[R300_VPS_ZERO_0] = 0;
                r300->hw.vps.cmd[R300_VPS_ZERO_1] = 0;
@@ -2084,10 +2438,11 @@ void r300UpdateShaders(r300ContextPtr rmesa)
                        hw_tcl_on = future_hw_tcl_on = 0;
                        r300ResetHwState(rmesa);
 
+                       r300UpdateStateParameters(ctx, _NEW_PROGRAM);
                        return;
                }
-               r300UpdateStateParameters(ctx, _NEW_PROGRAM);
        }
+       r300UpdateStateParameters(ctx, _NEW_PROGRAM);
 }
 
 static void r300SetupPixelShader(r300ContextPtr rmesa)
@@ -2095,6 +2450,7 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
        GLcontext *ctx = rmesa->radeon.glCtx;
        struct r300_fragment_program *fp = (struct r300_fragment_program *)
            (char *)ctx->FragmentProgram._Current;
+       struct r300_fragment_program_code *code;
        int i, k;
 
        if (!fp)                /* should only happenen once, just after context is created */
@@ -2106,76 +2462,169 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
                        __FUNCTION__);
                return;
        }
+       code = &fp->code;
+
+       r300SetupTextures(ctx);
 
        R300_STATECHANGE(rmesa, fpi[0]);
-       rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR0_0, fp->alu_end + 1);
-       for (i = 0; i <= fp->alu_end; i++) {
-               rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst0;
+       rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu_end + 1);
+       for (i = 0; i <= code->alu_end; i++) {
+               rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
        }
 
        R300_STATECHANGE(rmesa, fpi[1]);
-       rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR1_0, fp->alu_end + 1);
-       for (i = 0; i <= fp->alu_end; i++) {
-               rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst1;
+       rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu_end + 1);
+       for (i = 0; i <= code->alu_end; i++) {
+               rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
        }
 
        R300_STATECHANGE(rmesa, fpi[2]);
-       rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR2_0, fp->alu_end + 1);
-       for (i = 0; i <= fp->alu_end; i++) {
-               rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst2;
+       rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu_end + 1);
+       for (i = 0; i <= code->alu_end; i++) {
+               rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2;
        }
 
        R300_STATECHANGE(rmesa, fpi[3]);
-       rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR3_0, fp->alu_end + 1);
-       for (i = 0; i <= fp->alu_end; i++) {
-               rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst3;
+       rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu_end + 1);
+       for (i = 0; i <= code->alu_end; i++) {
+               rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst3;
        }
 
        R300_STATECHANGE(rmesa, fp);
-       rmesa->hw.fp.cmd[R300_FP_CNTL0] = fp->cur_node | (fp->first_node_has_tex << 3);
-       rmesa->hw.fp.cmd[R300_FP_CNTL1] = fp->max_temp_idx;
+       rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->cur_node | (code->first_node_has_tex << 3);
+       rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->max_temp_idx;
        rmesa->hw.fp.cmd[R300_FP_CNTL2] =
-         (fp->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
-         (fp->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) |
-         (fp->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
-         (fp->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
+         (code->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
+         (code->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) |
+         (code->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
+         (code->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
        /* I just want to say, the way these nodes are stored.. weird.. */
-       for (i = 0, k = (4 - (fp->cur_node + 1)); i < 4; i++, k++) {
-               if (i < (fp->cur_node + 1)) {
+       for (i = 0, k = (4 - (code->cur_node + 1)); i < 4; i++, k++) {
+               if (i < (code->cur_node + 1)) {
                        rmesa->hw.fp.cmd[R300_FP_NODE0 + k] =
-                         (fp->node[i].alu_offset << R300_PFS_NODE_ALU_OFFSET_SHIFT) |
-                         (fp->node[i].alu_end << R300_PFS_NODE_ALU_END_SHIFT) |
-                         (fp->node[i].tex_offset << R300_PFS_NODE_TEX_OFFSET_SHIFT) |
-                         (fp->node[i].tex_end << R300_PFS_NODE_TEX_END_SHIFT) |
-                         fp->node[i].flags;
+                         (code->node[i].alu_offset << R300_ALU_START_SHIFT) |
+                         (code->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
+                         (code->node[i].tex_offset << R300_TEX_START_SHIFT) |
+                         (code->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
+                         code->node[i].flags;
                } else {
                        rmesa->hw.fp.cmd[R300_FP_NODE0 + (3 - i)] = 0;
                }
        }
 
        R300_STATECHANGE(rmesa, fpp);
-       rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, fp->const_nr * 4);
-       for (i = 0; i < fp->const_nr; i++) {
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(fp->constant[i][0]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(fp->constant[i][1]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(fp->constant[i][2]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(fp->constant[i][3]);
+       rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
+       for (i = 0; i < code->const_nr; i++) {
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(code->constant[i][0]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(code->constant[i][1]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(code->constant[i][2]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(code->constant[i][3]);
        }
 }
 
+#define bump_r500fp_count(ptr, new_count)   do{ /* raise, never lower, the r500fp count in the command header at ptr */ \
+       drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
+       int _nc=(new_count)/6; /* new_count is in command words; r500SetupPixelShader passes six per instruction */ \
+       assert(_nc < 256); /* presumably the header count field is 8 bits wide - confirm */ \
+       if(_nc>_p->r500fp.count)_p->r500fp.count=_nc;\
+} while(0)
+
+#define bump_r500fp_const_count(ptr, new_count)   do{ /* same as bump_r500fp_count, but for the constant packet */ \
+       drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
+       int _nc=(new_count)/4; /* new_count is in command words; r500SetupPixelShader passes four per constant */ \
+       assert(_nc < 256); /* presumably the header count field is 8 bits wide - confirm */ \
+       if(_nc>_p->r500fp.count)_p->r500fp.count=_nc;\
+} while(0)
+/* Translate the current fragment program for r500 and fill the fp, r500fp and r500fp_const state atoms from the result. */
+static void r500SetupPixelShader(r300ContextPtr rmesa)
+{
+       GLcontext *ctx = rmesa->radeon.glCtx;
+       struct r500_fragment_program *fp = (struct r500_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+       int i;
+       struct r500_fragment_program_code *code;
+
+       if (!fp)                /* should only happen once, just after context is created */
+               return;
+
+       ((drm_r300_cmd_header_t *) rmesa->hw.r500fp.cmd)->r500fp.count = 0; /* both counts start at zero and are bumped below once the sizes are known */
+       ((drm_r300_cmd_header_t *) rmesa->hw.r500fp_const.cmd)->r500fp.count = 0;
+
+       r500TranslateFragmentShader(rmesa, fp);
+       if (!fp->translated) { /* nothing is programmed when the translated flag is still unset after the call */
+               fprintf(stderr, "%s: No valid fragment shader, exiting\n",
+                       __FUNCTION__);
+               return;
+       }
+       code = &fp->code;
+
+       r300SetupTextures(ctx);
+
+       R300_STATECHANGE(rmesa, fp);
+       rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
+
+       rmesa->hw.fp.cmd[R500_FP_CODE_ADDR] =
+           R500_US_CODE_START_ADDR(code->inst_offset) |
+           R500_US_CODE_END_ADDR(code->inst_end);
+       rmesa->hw.fp.cmd[R500_FP_CODE_RANGE] =
+           R500_US_CODE_RANGE_ADDR(code->inst_offset) |
+           R500_US_CODE_RANGE_SIZE(code->inst_end);
+       rmesa->hw.fp.cmd[R500_FP_CODE_OFFSET] =
+           R500_US_CODE_OFFSET_ADDR(0); /* FIXME when we add flow control */
+
+       R300_STATECHANGE(rmesa, r500fp);
+       /* Emit our shader: six command words per instruction, starting at cmd[1] */
+       for (i = 0; i < code->inst_end+1; i++) {
+               rmesa->hw.r500fp.cmd[i*6+1] = code->inst[i].inst0;
+               rmesa->hw.r500fp.cmd[i*6+2] = code->inst[i].inst1;
+               rmesa->hw.r500fp.cmd[i*6+3] = code->inst[i].inst2;
+               rmesa->hw.r500fp.cmd[i*6+4] = code->inst[i].inst3;
+               rmesa->hw.r500fp.cmd[i*6+5] = code->inst[i].inst4;
+               rmesa->hw.r500fp.cmd[i*6+6] = code->inst[i].inst5;
+       }
+
+       bump_r500fp_count(rmesa->hw.r500fp.cmd, (code->inst_end + 1) * 6);
+
+       R300_STATECHANGE(rmesa, r500fp_const);
+       for (i = 0; i < code->const_nr; i++) { /* four words per constant, packed with r300PackFloat32 (the r300 path uses r300PackFloat24) */
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(code->constant[i][0]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(code->constant[i][1]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(code->constant[i][2]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(code->constant[i][3]);
+       }
+       bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
+
+}
+
 void r300UpdateShaderStates(r300ContextPtr rmesa)
 {
        GLcontext *ctx;
        ctx = rmesa->radeon.glCtx;
 
        r300UpdateTextureState(ctx);
+       r300SetEarlyZState(ctx);
 
-       r300SetupPixelShader(rmesa);
-       r300SetupTextures(ctx);
+       GLuint fgdepthsrc = R300_FG_DEPTH_SRC_SCAN; /* default; replaced below when the fragment program writes depth */
+       if (current_fragment_program_writes_depth(ctx))
+               fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
+       if (fgdepthsrc != rmesa->hw.fg_depth_src.cmd[1]) { /* flag the state change only when the value actually differs */
+               R300_STATECHANGE(rmesa, fg_depth_src);
+               rmesa->hw.fg_depth_src.cmd[1] = fgdepthsrc;
+       }
+
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) /* families from RV515 up take the r500 pixel shader path */
+               r500SetupPixelShader(rmesa);
+       else
+               r300SetupPixelShader(rmesa);
+
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) /* same family split for the RS unit setup */
+               r500SetupRSUnit(ctx);
+       else
+               r300SetupRSUnit(ctx);
 
        if ((rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
                r300SetupVertexProgram(rmesa);
-       r300SetupRSUnit(ctx);
+
 }
 
 /**
@@ -2215,13 +2664,11 @@ void r300InitState(r300ContextPtr r300)
        switch (ctx->Visual.depthBits) {
        case 16:
                r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
-               depth_fmt = ZB_FORMAR_DEPTHFORMAT_16BIT_INT_Z;
-               r300->state.stencil.clear = 0x00000000;
+               depth_fmt = R300_DEPTHFORMAT_16BIT_INT_Z;
                break;
        case 24:
                r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
-               depth_fmt = ZB_FORMAR_DEPTHFORMAT_24BIT_INT_Z;
-               r300->state.stencil.clear = 0x00ff0000;
+               depth_fmt = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
                break;
        default:
                fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
@@ -2249,11 +2696,11 @@ void r300UpdateClipPlanes( GLcontext *ctx )
 {
        r300ContextPtr rmesa = R300_CONTEXT(ctx);
        GLuint p;
-       
+
        for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
                if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
                        GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
-                       
+
                        R300_STATECHANGE( rmesa, vpucp[p] );
                        rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
                        rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
@@ -2284,8 +2731,10 @@ void r300InitStateFuncs(struct dd_function_table *functions)
        functions->FrontFace = r300FrontFace;
        functions->ShadeModel = r300ShadeModel;
 
+       /* ARB_point_parameters */
+       functions->PointParameterfv = r300PointParameter;
+
        /* Stencil related */
-       functions->ClearStencil = r300ClearStencil;
        functions->StencilFuncSeparate = r300StencilFuncSeparate;
        functions->StencilMaskSeparate = r300StencilMaskSeparate;
        functions->StencilOpSeparate = r300StencilOpSeparate;