From a3c8de2fa7fba22647e5b3e8cfb05c85d1a5a980 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <rscheidegger@gmx.ch>
Date: Wed, 5 Oct 2005 11:42:44 +0000
Subject: [PATCH] Remove the tcl fallback for texture rectangle (by
 manipulating the texture matrix) (tested with texrect). Enable texgen for r/q
 coordinates (tested with projtex). Fix projected texcoords when an app uses
 TexCoord3x and the texture matrix to save on vertex size (fixes ut2k3 shadow
 projectors in tcl mode). From texgenmix, all cases with all texgen or no
 texgen work, with the exception of texgen enabled for s/t only, this one
 works with hw tcl, but not with vtxfmt (suspect issues with vtxfmt), the
 mixed cases do not work (which is expected, and should be rare in practice),
 with the exception of the first one which hits a tcl fallback.

---
 src/mesa/drivers/dri/radeon/radeon_context.c  |   6 +-
 src/mesa/drivers/dri/radeon/radeon_context.h  |   5 +-
 .../drivers/dri/radeon/radeon_maos_arrays.c   |  43 +++--
 .../drivers/dri/radeon/radeon_maos_vbtmp.h    |  24 ++-
 .../drivers/dri/radeon/radeon_maos_verts.c    |  28 ++-
 src/mesa/drivers/dri/radeon/radeon_state.c    | 167 +++++++++++++++---
 src/mesa/drivers/dri/radeon/radeon_state.h    |   4 +-
 src/mesa/drivers/dri/radeon/radeon_tcl.c      |   6 +-
 src/mesa/drivers/dri/radeon/radeon_tcl.h      |   3 -
 src/mesa/drivers/dri/radeon/radeon_texstate.c | 113 ++++++++----
 10 files changed, 299 insertions(+), 100 deletions(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index ae9dd354733..abb2c72e79c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -405,10 +405,12 @@ radeonCreateContext( const __GLcontextModes *glVisual,
 
    _math_matrix_ctr( &rmesa->TexGenMatrix[0] );
    _math_matrix_ctr( &rmesa->TexGenMatrix[1] );
-   _math_matrix_ctr( &rmesa->tmpmat );
+   _math_matrix_ctr( &rmesa->tmpmat[0] );
+   _math_matrix_ctr( &rmesa->tmpmat[1] );
    _math_matrix_set_identity( &rmesa->TexGenMatrix[0] );
    _math_matrix_set_identity( &rmesa->TexGenMatrix[1] );
-   _math_matrix_set_identity( &rmesa->tmpmat );
+   _math_matrix_set_identity( &rmesa->tmpmat[0] );
+   _math_matrix_set_identity( &rmesa->tmpmat[1] );
 
    driInitExtensions( ctx, card_extensions, GL_TRUE );
    if (rmesa->glCtx->Mesa_DXTn) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index 8c1c70c1228..3bd8f6ebc68 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -753,9 +753,10 @@ struct radeon_context {
    GLmatrix TexGenMatrix[RADEON_MAX_TEXTURE_UNITS];
    GLboolean recheck_texgen[RADEON_MAX_TEXTURE_UNITS];
    GLboolean TexGenNeedNormals[RADEON_MAX_TEXTURE_UNITS];
-   GLuint TexMatEnabled;
    GLuint TexGenEnabled;
-   GLmatrix tmpmat;
+   GLuint NeedTexMatrix;
+   GLuint TexMatColSwap;
+   GLmatrix tmpmat[RADEON_MAX_TEXTURE_UNITS];
    GLuint last_ReallyEnabled;
 
    /* VBI
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
index 98f66898c73..b5c6f12248c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
@@ -387,6 +387,7 @@ static void emit_tex_vector( GLcontext *ctx,
 
    switch (size) {
    case 4: emitsize = 3; break;
+   case 3: emitsize = 3; break;
    default: emitsize = 2; break;
    }
 
@@ -416,7 +417,7 @@ static void emit_tex_vector( GLcontext *ctx,
       emit_vec8( ctx, rvb, data, stride, count );
       break;
    case 3:
-      emit_vec8( ctx, rvb, data, stride, count );
+      emit_vec12( ctx, rvb, data, stride, count );
       break;
    case 4:
       emit_stq_vec( ctx, rvb, data, stride, count );
@@ -529,38 +530,52 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 
    if (inputs & VERT_BIT_TEX0) {
       if (!rmesa->tcl.tex[0].buf)
-	 emit_tex_vector( ctx, 
-			  &(rmesa->tcl.tex[0]), 
+	 emit_tex_vector( ctx,
+			  &(rmesa->tcl.tex[0]),
 			  (char *)VB->TexCoordPtr[0]->data,
 			  VB->TexCoordPtr[0]->size,
 			  VB->TexCoordPtr[0]->stride,
 			  count );
 
-      switch( VB->TexCoordPtr[0]->size ) {
-      case 4:
-	 vtx |= RADEON_TCL_VTX_Q0; 
+      vfmt |= RADEON_CP_VC_FRMT_ST0;
+      /* assume we need the 3rd coord if texgen is active for r/q OR at least 3
+         coords are submitted. This may not be 100% correct */
+      if (VB->TexCoordPtr[0]->size >= 3) {
+	 vtx |= RADEON_TCL_VTX_Q0;
 	 vfmt |= RADEON_CP_VC_FRMT_Q0;
-      default: 
-	 vfmt |= RADEON_CP_VC_FRMT_ST0;
+      }
+      if ( (ctx->Texture.Unit[0].TexGenEnabled & (R_BIT | Q_BIT)) )
+	 vtx |= RADEON_TCL_VTX_Q0;
+      else if (VB->TexCoordPtr[0]->size >= 3) {
+	 GLuint swaptexmatcol = (VB->TexCoordPtr[0]->size - 3);
+	 if ((rmesa->NeedTexMatrix & 1) &&
+		(swaptexmatcol != (rmesa->TexMatColSwap & 1)))
+	    radeonUploadTexMatrix( rmesa, rmesa->tmpmat[0].m, 0, swaptexmatcol ) ;
       }
       component[nr++] = &rmesa->tcl.tex[0];
    }
 
    if (inputs & VERT_BIT_TEX1) {
       if (!rmesa->tcl.tex[1].buf)
-	 emit_tex_vector( ctx, 
-			  &(rmesa->tcl.tex[1]), 
+	 emit_tex_vector( ctx,
+			  &(rmesa->tcl.tex[1]),
 			  (char *)VB->TexCoordPtr[1]->data,
 			  VB->TexCoordPtr[1]->size,
 			  VB->TexCoordPtr[1]->stride,
 			  count );
 	 
-      switch( VB->TexCoordPtr[1]->size ) {
-      case 4: 
+      vfmt |= RADEON_CP_VC_FRMT_ST1;
+      if (VB->TexCoordPtr[1]->size >= 3) {
 	 vtx |= RADEON_TCL_VTX_Q1;
 	 vfmt |= RADEON_CP_VC_FRMT_Q1;
-      default: 
-	 vfmt |= RADEON_CP_VC_FRMT_ST1;
+      }
+      if ( (ctx->Texture.Unit[1].TexGenEnabled & (R_BIT | Q_BIT)) )
+	 vtx |= RADEON_TCL_VTX_Q1;
+      else if (VB->TexCoordPtr[1]->size >= 3) {
+	 GLuint swaptexmatcol = (VB->TexCoordPtr[1]->size - 3);
+	 if (((rmesa->NeedTexMatrix >> 1) & 1) &&
+		(swaptexmatcol != ((rmesa->TexMatColSwap >> 1) & 1)))
+	    radeonUploadTexMatrix( rmesa, rmesa->tmpmat[1].m, 1, swaptexmatcol ) ;
       }
       component[nr++] = &rmesa->tcl.tex[1];
    }
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h b/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
index c16234a9437..8a07a01cb21 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
@@ -47,6 +47,7 @@ static void TAG(emit)( GLcontext *ctx,
    GLuint tc0_stride, tc1_stride, col_stride, spec_stride, fog_stride;
    GLuint tc2_stride, norm_stride;
    GLuint fill_tex = 0;
+   GLuint rqcoordsnoswap = 0;
    GLuint (*coord)[4];
    GLuint coord_stride; /* object coordinates */
    GLubyte dummy[4];
@@ -65,9 +66,14 @@ static void TAG(emit)( GLcontext *ctx,
 	 const GLuint t2 = GET_TEXSOURCE(2);
 	 tc2 = (GLuint (*)[4])VB->TexCoordPtr[t2]->data;
 	 tc2_stride = VB->TexCoordPtr[t2]->stride;
-	 if (DO_PTEX && VB->TexCoordPtr[t2]->size < 4) {
+	 if (DO_PTEX && VB->TexCoordPtr[t2]->size < 3) {
+	 /* since DO_PTEX is only true when we have 3 or more coords
+	    in the first place we don't really need this right? */
 	    fill_tex |= (1<<2);
 	 }
+	 else if (DO_PTEX && VB->TexCoordPtr[t2]->size < 4) {
+	    rqcoordsnoswap |= (1<<2);
+	 }
       } else {
 	 tc2 = (GLuint (*)[4])&ctx->Current.Attrib[VERT_ATTRIB_TEX2];
 	 tc2_stride = 0;
@@ -79,9 +85,12 @@ static void TAG(emit)( GLcontext *ctx,
 	 const GLuint t1 = GET_TEXSOURCE(1);
 	 tc1 = (GLuint (*)[4])VB->TexCoordPtr[t1]->data;
 	 tc1_stride = VB->TexCoordPtr[t1]->stride;
-	 if (DO_PTEX && VB->TexCoordPtr[t1]->size < 4) {
+	 if (DO_PTEX && VB->TexCoordPtr[t1]->size < 3) {
 	    fill_tex |= (1<<1);
 	 }
+	 else if (DO_PTEX && VB->TexCoordPtr[t1]->size < 4) {
+	    rqcoordsnoswap |= (1<<1);
+	 }
       } else {
 	 tc1 = (GLuint (*)[4])&ctx->Current.Attrib[VERT_ATTRIB_TEX1];
 	 tc1_stride = 0;
@@ -93,9 +102,12 @@ static void TAG(emit)( GLcontext *ctx,
 	 const GLuint t0 = GET_TEXSOURCE(0);
 	 tc0_stride = VB->TexCoordPtr[t0]->stride;
 	 tc0 = (GLuint (*)[4])VB->TexCoordPtr[t0]->data;
-	 if (DO_PTEX && VB->TexCoordPtr[t0]->size < 4) {
+	 if (DO_PTEX && VB->TexCoordPtr[t0]->size < 3) {
 	    fill_tex |= (1<<0);
 	 }
+	 else if (DO_PTEX && VB->TexCoordPtr[t0]->size < 4) {
+	    rqcoordsnoswap |= (1<<0);
+	 }
       } else {
 	 tc0 = (GLuint (*)[4])&ctx->Current.Attrib[VERT_ATTRIB_TEX0];
 	 tc0_stride = 0;
@@ -213,6 +225,8 @@ static void TAG(emit)( GLcontext *ctx,
 	    if (DO_PTEX) {
 	       if (fill_tex & (1<<0))
 		  v[2].f = 1.0;
+	       else if (rqcoordsnoswap & (1<<0))
+		  v[2].ui = tc0[0][2];
 	       else
 		  v[2].ui = tc0[0][3];
 	       if (TCL_DEBUG) fprintf(stderr, "%.2f ", v[2].f);
@@ -229,6 +243,8 @@ static void TAG(emit)( GLcontext *ctx,
 	    if (DO_PTEX) {
 	       if (fill_tex & (1<<1))
 		  v[2].f = 1.0;
+	       else if (rqcoordsnoswap & (1<<1))
+		  v[2].ui = tc1[0][2];
 	       else
 		  v[2].ui = tc1[0][3];
 	       if (TCL_DEBUG) fprintf(stderr, "%.2f ", v[2].f);
@@ -244,6 +260,8 @@ static void TAG(emit)( GLcontext *ctx,
 	    if (DO_PTEX) {
 	       if (fill_tex & (1<<2))
 		  v[2].f = 1.0;
+	       else if (rqcoordsnoswap & (1<<2))
+		  v[2].ui = tc2[0][2];
 	       else
 		  v[2].ui = tc2[0][3];
 	       v += 3;
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
index 8cb08a812a5..f3221e60d8f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
@@ -243,7 +243,7 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       init_tcl_verts();
       firsttime = 0;
    }
-		     
+
    if (1) {
       req |= RADEON_CP_VC_FRMT_Z;
       if (VB->ObjPtr->size == 4) {
@@ -254,7 +254,7 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
    if (inputs & VERT_BIT_NORMAL) {
       req |= RADEON_CP_VC_FRMT_N0;
    }
-   
+
    if (inputs & VERT_BIT_COLOR0) {
       req |= RADEON_CP_VC_FRMT_PKCOLOR;
    }
@@ -265,20 +265,38 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 
    if (inputs & VERT_BIT_TEX0) {
       req |= RADEON_CP_VC_FRMT_ST0;
-
-      if (VB->TexCoordPtr[0]->size == 4) {
+      /* assume we need the 3rd coord if texgen is active for r/q OR at least 3
+         coords are submitted. This may not be 100% correct */
+      if (VB->TexCoordPtr[0]->size >= 3) {
 	 req |= RADEON_CP_VC_FRMT_Q0;
 	 vtx |= RADEON_TCL_VTX_Q0;
       }
+      if ( (ctx->Texture.Unit[0].TexGenEnabled & (R_BIT | Q_BIT)) )
+	 vtx |= RADEON_TCL_VTX_Q0;
+      else if (VB->TexCoordPtr[0]->size >= 3) {
+	 GLuint swaptexmatcol = (VB->TexCoordPtr[0]->size - 3);
+	 if ((rmesa->NeedTexMatrix & 1) &&
+		(swaptexmatcol != (rmesa->TexMatColSwap & 1)))
+	    radeonUploadTexMatrix( rmesa, rmesa->tmpmat[0].m, 0, swaptexmatcol ) ;
+      }
    }
 
+
    if (inputs & VERT_BIT_TEX1) {
       req |= RADEON_CP_VC_FRMT_ST1;
 
-      if (VB->TexCoordPtr[1]->size == 4) {
+      if (VB->TexCoordPtr[1]->size >= 3) {
 	 req |= RADEON_CP_VC_FRMT_Q1;
 	 vtx |= RADEON_TCL_VTX_Q1;
       }
+      if ( (ctx->Texture.Unit[1].TexGenEnabled & (R_BIT | Q_BIT)) )
+	 vtx |= RADEON_TCL_VTX_Q1;
+      else if (VB->TexCoordPtr[1]->size >= 3) {
+	 GLuint swaptexmatcol = (VB->TexCoordPtr[1]->size - 3);
+	 if (((rmesa->NeedTexMatrix >> 1) & 1) &&
+		(swaptexmatcol != ((rmesa->TexMatColSwap >> 1) & 1)))
+	    radeonUploadTexMatrix( rmesa, rmesa->tmpmat[1].m, 1, swaptexmatcol ) ;
+      }
    }
 
    if (vtx != rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT]) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index 3c7767768b2..d99a2f4c203 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -2024,7 +2024,105 @@ static void radeonLightingSpaceChange( GLcontext *ctx )
  * Deferred state management - matrices, textures, other?
  */
 
+/* Set tmpmat[unit] to identity with m[0]=1/Width, m[5]=1/Height (base image). */
+static void texmat_set_texrect( radeonContextPtr rmesa,
+				struct gl_texture_object *tObj, GLuint unit )
+{
+   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
+   _math_matrix_set_identity( &rmesa->tmpmat[unit] );
+   rmesa->tmpmat[unit].m[0] = 1.0 / baseImage->Width;
+   rmesa->tmpmat[unit].m[5] = 1.0 / baseImage->Height;
+}
+
+/* Divide tmpmat[unit].m[0..3] by base image Width and m[4..7] by Height. */
+static void texmat_fixup_texrect( radeonContextPtr rmesa,
+				  struct gl_texture_object *tObj, GLuint unit )
+{
+   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
+   GLuint i;
+   for (i = 0; i < 4; i++) { /* NOTE(review): confirm m[0..3]/m[4..7] vs m[0,4,8,12]/m[1,5,9,13] */
+      rmesa->tmpmat[unit].m[i] = rmesa->tmpmat[unit].m[i] / baseImage->Width;
+      rmesa->tmpmat[unit].m[i+4] = rmesa->tmpmat[unit].m[i+4] / baseImage->Height;
+   }}
+
+void radeonUploadTexMatrix( radeonContextPtr rmesa, GLfloat *src,
+			    int unit, GLboolean swapcols )
+{
+/* Upload src (16 floats, indexed like GLmatrix.m) as texture matrix of unit.
+   Here's how this works: on r100, only 3 tex coords can be submitted, so the
+   vector looks like this probably: (s t r|q 0) (not sure if the last coord
+   is hardwired to 0, could be 1 too). Interestingly, it actually looks like
+   texgen generates all 4 coords, at least tests with projtex indicated that.
+   So: if we need the q coord in the end (solely determined by the texture
+   target, i.e. 2d / 1d / texrect targets) we swap the third and 4th row.
+   Additionally, if we don't have texgen but 4 tex coords submitted, we swap
+   column 3 and 4 (for the 2d / 1d / texrect targets) since the q coord
+   will get submitted in the "wrong", i.e. 3rd, slot.
+   If an app submits 3 coords for 2d targets, we assume it is saving on vertex
+   size and using the texture matrix to swap the r and q coords around (ut2k3
+   does exactly that), so we don't need the 3rd / 4th column swap - still need
+   the 3rd / 4th row swap of course. This will potentially break for apps which
+   use TexCoord3x just for fun. Additionally, it will never work if an app uses
+   an "advanced" texture matrix and relies on all 4 texcoord inputs to generate
+   the maximum needed 3. This seems impossible to do with hw tcl on r100, and
+   incredibly hard to detect so we can't just fallback in such a case. Assume
+   it never happens... - rs */
+
+   int idx = TEXMAT_0 + unit;
+   float *dest = ((float *)RADEON_DB_STATE( mat[idx] )) + MAT_ELT_0;
+   int i;
+   const struct gl_texture_unit *tUnit = &rmesa->glCtx->Texture.Unit[unit];
+
+   rmesa->TexMatColSwap &= ~(1 << unit); /* set again below only if cols are swapped */
+   if ((tUnit->_ReallyEnabled & (TEXTURE_3D_BIT | TEXTURE_CUBE_BIT)) == 0) {
+      if (swapcols) {
+	 rmesa->TexMatColSwap |= 1 << unit;
+	 /* attention some elems are swapped 2 times! */
+	 *dest++ = src[0];
+	 *dest++ = src[4];
+	 *dest++ = src[12];
+	 *dest++ = src[8];
+	 *dest++ = src[1];
+	 *dest++ = src[5];
+	 *dest++ = src[13];
+	 *dest++ = src[9];
+	 *dest++ = src[2];
+	 *dest++ = src[6];
+	 *dest++ = src[15];
+	 *dest++ = src[11];
+	 /* those last 4 are probably never used */
+	 *dest++ = src[3];
+	 *dest++ = src[7];
+	 *dest++ = src[14];
+	 *dest++ = src[10];
+      }
+      else {
+	 for (i = 0; i < 2; i++) {
+	    *dest++ = src[i];
+	    *dest++ = src[i+4];
+	    *dest++ = src[i+8];
+	    *dest++ = src[i+12];
+	 }
+	 for (i = 3; i >= 2; i--) {
+	    *dest++ = src[i];
+	    *dest++ = src[i+4];
+	    *dest++ = src[i+8];
+	    *dest++ = src[i+12];
+	 }
+      }
+   }
+   else {
+      /* never used currently - no swapping needed at all presumably */
+      for (i = 0 ; i < 4 ; i++) {
+	 *dest++ = src[i];
+	 *dest++ = src[i+4];
+	 *dest++ = src[i+8];
+	 *dest++ = src[i+12];
+      }
+   }
 
+   RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
+}
 
 
 static void upload_matrix( radeonContextPtr rmesa, GLfloat *src, int idx )
@@ -2057,42 +2155,53 @@ static void update_texturematrix( GLcontext *ctx )
    GLuint tpc = rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL];
    GLuint vs = rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL];
    int unit;
-
-   rmesa->TexMatEnabled = 0;
+   GLuint texMatEnabled = 0;
+   rmesa->NeedTexMatrix = 0;
+   rmesa->TexMatColSwap = 0;
 
    for (unit = 0 ; unit < 2; unit++) {
-      if (!ctx->Texture.Unit[unit]._ReallyEnabled) {
-      }
-      else if (ctx->TextureMatrixStack[unit].Top->type != MATRIX_IDENTITY) {
-	 GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
-	 
-	 rmesa->TexMatEnabled |= (RADEON_TEXGEN_TEXMAT_0_ENABLE|
-				  RADEON_TEXMAT_0_ENABLE) << unit;
-
-	 if (rmesa->TexGenEnabled & (RADEON_TEXMAT_0_ENABLE << unit)) {
-	    /* Need to preconcatenate any active texgen 
-	     * obj/eyeplane matrices:
-	     */
-	    _math_matrix_mul_matrix( &rmesa->tmpmat,
+      if (ctx->Texture.Unit[unit]._ReallyEnabled) {
+	 GLboolean needMatrix = GL_FALSE;
+	 if (ctx->TextureMatrixStack[unit].Top->type != MATRIX_IDENTITY) {
+	    needMatrix = GL_TRUE;
+	    texMatEnabled |= (RADEON_TEXGEN_TEXMAT_0_ENABLE |
+			      RADEON_TEXMAT_0_ENABLE) << unit;
+
+	    if (rmesa->TexGenEnabled & (RADEON_TEXMAT_0_ENABLE << unit)) {
+	       /* Need to preconcatenate any active texgen
+	        * obj/eyeplane matrices:
+	        */
+	       _math_matrix_mul_matrix( &rmesa->tmpmat[unit],
 				     ctx->TextureMatrixStack[unit].Top,
 				     &rmesa->TexGenMatrix[unit] );
-	    upload_matrix( rmesa, rmesa->tmpmat.m, TEXMAT_0+unit );
+	    }
+	    else {
+	       _math_matrix_copy( &rmesa->tmpmat[unit],
+		  ctx->TextureMatrixStack[unit].Top );
+	    }
 	 }
-	 else {
-	    rmesa->TexMatEnabled |= 
-	       (RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
-	    upload_matrix( rmesa, ctx->TextureMatrixStack[unit].Top->m, 
-			   TEXMAT_0+unit );
+	 else if (rmesa->TexGenEnabled & (RADEON_TEXMAT_0_ENABLE << unit)) {
+	    _math_matrix_copy( &rmesa->tmpmat[unit], &rmesa->TexGenMatrix[unit] );
+	    needMatrix = GL_TRUE;
+	 }
+	 if (ctx->Texture.Unit[unit]._ReallyEnabled == TEXTURE_RECT_BIT) {
+	    texMatEnabled |= (RADEON_TEXGEN_TEXMAT_0_ENABLE |
+			      RADEON_TEXMAT_0_ENABLE) << unit;
+	    if (needMatrix)
+	       texmat_fixup_texrect( rmesa, ctx->Texture.Unit[unit]._Current, unit );
+	    else
+	       texmat_set_texrect( rmesa, ctx->Texture.Unit[unit]._Current, unit );
+	    needMatrix = GL_TRUE;
+	 }
+	 if (needMatrix) {
+	    rmesa->NeedTexMatrix |= 1 << unit;
+	    radeonUploadTexMatrix( rmesa, rmesa->tmpmat[unit].m, unit,
+			!ctx->Texture.Unit[unit].TexGenEnabled );
 	 }
-      }
-      else if (rmesa->TexGenEnabled & (RADEON_TEXMAT_0_ENABLE << unit)) {
-	 upload_matrix( rmesa, rmesa->TexGenMatrix[unit].m, 
-			TEXMAT_0+unit );
       }
    }
 
-
-   tpc = (rmesa->TexMatEnabled | rmesa->TexGenEnabled);
+   tpc = (texMatEnabled | rmesa->TexGenEnabled);
 
    vs &= ~((0xf << RADEON_TCL_TEX_0_OUTPUT_SHIFT) |
 	   (0xf << RADEON_TCL_TEX_1_OUTPUT_SHIFT));
@@ -2109,7 +2218,7 @@ static void update_texturematrix( GLcontext *ctx )
 
    if (tpc != rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL] ||
        vs != rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL]) {
-      
+
       RADEON_STATECHANGE(rmesa, tcl);
       rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL] = tpc;
       rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] = vs;
@@ -2188,7 +2297,7 @@ void radeonValidateState( GLcontext *ctx )
     */
    if (new_state & _NEW_TEXTURE_MATRIX) {
       update_texturematrix( ctx );
-   }      
+   }
 
    if (new_state & (_NEW_LIGHT|_NEW_MODELVIEW|_MESA_NEW_NEED_EYE_COORDS)) {
       update_light( ctx );
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.h b/src/mesa/drivers/dri/radeon/radeon_state.h
index c9f5c05cf4d..a1afa500076 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.h
+++ b/src/mesa/drivers/dri/radeon/radeon_state.h
@@ -49,7 +49,9 @@ extern void radeonSetCliprects( radeonContextPtr rmesa );
 extern void radeonRecalcScissorRects( radeonContextPtr rmesa );
 extern void radeonUpdateViewportOffset( GLcontext *ctx );
 extern void radeonUpdateWindow( GLcontext *ctx );
-extern void radeonUpdateDrawBuffer(GLcontext *ctx);
+extern void radeonUpdateDrawBuffer( GLcontext *ctx );
+extern void radeonUploadTexMatrix( radeonContextPtr rmesa, GLfloat *src,
+				       int unit, GLboolean swapcols );
 
 extern void radeonValidateState( GLcontext *ctx );
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c
index b13042d9658..fb0f10a862f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c
@@ -320,6 +320,7 @@ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
 
    for (i = 0 ; i < ctx->Const.MaxTextureUnits; i++) {
       if (ctx->Texture.Unit[i]._ReallyEnabled) {
+      /* TODO: probably should not emit texture coords when texgen is enabled */
 	 if (rmesa->TexGenNeedNormals[i]) {
 	    inputs |= VERT_BIT_NORMAL;
 	 }
@@ -444,10 +445,7 @@ static char *fallbackStrings[] = {
    "Texgen unit 0",
    "Texgen unit 1",
    "Texgen unit 2",
-   "User disable",
-   "texture rectangle unit 0",
-   "texture rectangle unit 1",
-   "texture rectangle unit 2"
+   "User disable"
 };
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.h b/src/mesa/drivers/dri/radeon/radeon_tcl.h
index e292d23037b..263b803d625 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tcl.h
+++ b/src/mesa/drivers/dri/radeon/radeon_tcl.h
@@ -55,9 +55,6 @@ extern void radeonTclFallback( GLcontext *ctx, GLuint bit, GLboolean mode );
 #define RADEON_TCL_FALLBACK_TEXGEN_1          0x20 /* texgen, unit 1 */
 #define RADEON_TCL_FALLBACK_TEXGEN_2          0x40 /* texgen, unit 2 */
 #define RADEON_TCL_FALLBACK_TCL_DISABLE       0x80 /* user disable */
-#define RADEON_TCL_FALLBACK_TEXRECT_0         0x100 /* texture rectangle */
-#define RADEON_TCL_FALLBACK_TEXRECT_1         0x200 /* texture rectangle */
-#define RADEON_TCL_FALLBACK_TEXRECT_2         0x400 /* texture rectangle */
 
 #define RADEON_MAX_TCL_VERTSIZE (15*4)
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
index e12fd41ad91..03324941bae 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -834,7 +834,9 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
 static void set_texgen_matrix( radeonContextPtr rmesa, 
 			       GLuint unit,
 			       const GLfloat *s_plane,
-			       const GLfloat *t_plane )
+			       const GLfloat *t_plane,
+			       const GLfloat *r_plane,
+			       const GLfloat *q_plane )
 {
    rmesa->TexGenMatrix[unit].m[0]  = s_plane[0];
    rmesa->TexGenMatrix[unit].m[4]  = s_plane[1];
@@ -846,78 +848,119 @@ static void set_texgen_matrix( radeonContextPtr rmesa,
    rmesa->TexGenMatrix[unit].m[9]  = t_plane[2];
    rmesa->TexGenMatrix[unit].m[13] = t_plane[3];
 
+   rmesa->TexGenMatrix[unit].m[2]  = r_plane[0];
+   rmesa->TexGenMatrix[unit].m[6]  = r_plane[1];
+   rmesa->TexGenMatrix[unit].m[10] = r_plane[2];
+   rmesa->TexGenMatrix[unit].m[14] = r_plane[3];
+
+   rmesa->TexGenMatrix[unit].m[3]  = q_plane[0];
+   rmesa->TexGenMatrix[unit].m[7]  = q_plane[1];
+   rmesa->TexGenMatrix[unit].m[11] = q_plane[2];
+   rmesa->TexGenMatrix[unit].m[15] = q_plane[3];
+
    rmesa->TexGenEnabled |= RADEON_TEXMAT_0_ENABLE << unit;
    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
 }
 
-/* Ignoring the Q texcoord for now.
- *
- * Returns GL_FALSE if fallback required.  
+/* Returns GL_FALSE if fallback required.
  */
 static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
-{  
+{
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
    GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
    GLuint tmp = rmesa->TexGenEnabled;
-
-   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
-   rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
-   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
+   static const GLfloat reflect[16] = {
+      -1,  0,  0,  0,
+       0, -1,  0,  0,
+       0,  0,  -1, 0,
+       0,  0,  0,  1 };
+
+   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE << unit);
+   rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE << unit);
+   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK << inputshift);
    rmesa->TexGenNeedNormals[unit] = 0;
 
-   if ((texUnit->TexGenEnabled & (S_BIT|T_BIT)) == 0) {
+   if ((texUnit->TexGenEnabled & (S_BIT|T_BIT|R_BIT|Q_BIT)) == 0) {
       /* Disabled, no fallback:
        */
-      rmesa->TexGenEnabled |= 
-	 (RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
+      rmesa->TexGenEnabled |=
+	 (RADEON_TEXGEN_INPUT_TEXCOORD_0 + unit) << inputshift;
       return GL_TRUE;
    }
-   else if (texUnit->TexGenEnabled & Q_BIT) {
-      /* Very easy to do this, in fact would remove a fallback case
-       * elsewhere, but I haven't done it yet...  Fallback: 
-       */
-      if (RADEON_DEBUG & DEBUG_FALLBACKS) 
-	fprintf(stderr, "fallback Q_BIT\n");
-      return GL_FALSE;
+   /* the r100 cannot do texgen for some coords and not for others
+    * we do not detect such cases (certainly can't do it here) and just
+    * ASSUME that when S and T are texgen enabled we do not need other
+    * non-texgen enabled coords, no matter if the R and Q bits are texgen
+    * enabled. Still check for mixed mode texgen for all coords.
+    */
+   else if ( (texUnit->TexGenEnabled & S_BIT) &&
+	     (texUnit->TexGenEnabled & T_BIT) &&
+	     (texUnit->GenModeS == texUnit->GenModeT) ) {
+      if ( ((texUnit->TexGenEnabled & R_BIT) &&
+	    (texUnit->GenModeS != texUnit->GenModeR)) ||
+	   ((texUnit->TexGenEnabled & Q_BIT) &&
+	    (texUnit->GenModeS != texUnit->GenModeQ)) ) {
+	 /* Mixed modes, fallback:
+	  */
+	 if (RADEON_DEBUG & DEBUG_FALLBACKS)
+	    fprintf(stderr, "fallback mixed texgen\n");
+	 return GL_FALSE;
+      }
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_TEXMAT_0_ENABLE << unit;
    }
-   else if ((texUnit->TexGenEnabled & (S_BIT|T_BIT)) != (S_BIT|T_BIT) ||
-	    texUnit->GenModeS != texUnit->GenModeT) {
-      /* Mixed modes, fallback:
-       */
-      if (RADEON_DEBUG & DEBUG_FALLBACKS) 
-        fprintf(stderr, "fallback mixed texgen\n");
+   else {
+   /* some texgen mode not including both S and T bits */
+      if (RADEON_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "fallback mixed texgen/nontexgen\n");
       return GL_FALSE;
    }
-   else
-      rmesa->TexGenEnabled |= RADEON_TEXGEN_TEXMAT_0_ENABLE << unit;
+
+   if ((texUnit->TexGenEnabled & (R_BIT | Q_BIT)) != 0) {
+      /* need this here for vtxfmt presumably. Argh we need to set
+         this from way too many places, would be much easier if we could leave
+         tcl q coord always enabled as on r200) */
+      RADEON_STATECHANGE( rmesa, tcl );
+      if (unit == 0)
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_Q0;
+      else
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_Q1;
+   }
 
    switch (texUnit->GenModeS) {
    case GL_OBJECT_LINEAR:
       rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_OBJ << inputshift;
-      set_texgen_matrix( rmesa, unit, 
+      set_texgen_matrix( rmesa, unit,
 			 texUnit->ObjectPlaneS,
-			 texUnit->ObjectPlaneT);
+			 texUnit->ObjectPlaneT,
+			 texUnit->ObjectPlaneR,
+			 texUnit->ObjectPlaneQ);
       break;
 
    case GL_EYE_LINEAR:
       rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE << inputshift;
-      set_texgen_matrix( rmesa, unit, 
+      set_texgen_matrix( rmesa, unit,
 			 texUnit->EyePlaneS,
-			 texUnit->EyePlaneT);
+			 texUnit->EyePlaneT,
+			 texUnit->EyePlaneR,
+			 texUnit->EyePlaneQ);
       break;
 
    case GL_REFLECTION_MAP_NV:
       rmesa->TexGenNeedNormals[unit] = GL_TRUE;
-      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_REFLECT<<inputshift;
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_REFLECT << inputshift;
+      /* TODO: unknown if this is needed/correct */
+      set_texgen_matrix( rmesa, unit, reflect, reflect + 4,
+			reflect + 8, reflect + 12 );
       break;
 
    case GL_NORMAL_MAP_NV:
       rmesa->TexGenNeedNormals[unit] = GL_TRUE;
-      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_NORMAL<<inputshift;
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_NORMAL << inputshift;
       break;
 
    case GL_SPHERE_MAP:
+      /* the mode which everyone uses :-( */
    default:
       /* Unsupported mode, fallback:
        */
@@ -1131,11 +1174,7 @@ static GLboolean radeonUpdateTextureUnit( GLcontext *ctx, int unit )
 {
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
 
-   TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_TEXRECT_0 << unit, 0 );
-
    if ( texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT) ) {
-      TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_TEXRECT_0 << unit, 1 );
-
       return (enable_tex_rect( ctx, unit ) &&
 	      update_tex_common( ctx, unit ));
    }
-- 
2.30.2