Merge commit 'origin/master' into gallium-map-range

[mesa.git] / src / mesa / drivers / dri / radeon / radeon_texstate.c
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c

index d17d9db4ac48568f4e5a379aafba42ad9bbd7ea5..b165205c0939ea633b3497d68adc50aceaa8b394 100644 (file)
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -1,4 +1,3 @@
-/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_texstate.c,v 1.6 2002/12/16 16:18:59 dawes Exp $ */
  /**************************************************************************
  
  Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
@@ -34,13 +33,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   *   Gareth Hughes <gareth@valinux.com>
   */
  
-#include "glheader.h"
-#include "imports.h"
-#include "colormac.h"
-#include "context.h"
-#include "macros.h"
-#include "texformat.h"
-#include "enums.h"
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/colormac.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/texformat.h"
+#include "main/texobj.h"
+#include "main/enums.h"
  
  #include "radeon_context.h"
  #include "radeon_state.h"
@@ -55,6 +55,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  #define RADEON_TXFORMAT_AL88      RADEON_TXFORMAT_AI88
  #define RADEON_TXFORMAT_YCBCR     RADEON_TXFORMAT_YVYU422
  #define RADEON_TXFORMAT_YCBCR_REV RADEON_TXFORMAT_VYUY422
+#define RADEON_TXFORMAT_RGB_DXT1  RADEON_TXFORMAT_DXT1
+#define RADEON_TXFORMAT_RGBA_DXT1 RADEON_TXFORMAT_DXT1
+#define RADEON_TXFORMAT_RGBA_DXT3 RADEON_TXFORMAT_DXT23
+#define RADEON_TXFORMAT_RGBA_DXT5 RADEON_TXFORMAT_DXT45
  
  #define _COLOR(f) \
      [ MESA_FORMAT_ ## f ] = { RADEON_TXFORMAT_ ## f, 0 }
@@ -80,7 +84,7 @@ tx_table[] =
     _ALPHA_REV(RGBA8888),
     _ALPHA(ARGB8888),
     _ALPHA_REV(ARGB8888),
-   _INVALID(RGB888),
+   [ MESA_FORMAT_RGB888 ] = { RADEON_TXFORMAT_ARGB8888, 0 },
     _COLOR(RGB565),
     _COLOR_REV(RGB565),
     _ALPHA(ARGB4444),
@@ -95,6 +99,12 @@ tx_table[] =
     _INVALID(CI8),
     _YUV(YCBCR),
     _YUV(YCBCR_REV),
+   _INVALID(RGB_FXT1),
+   _INVALID(RGBA_FXT1),
+   _COLOR(RGB_DXT1),
+   _ALPHA(RGBA_DXT1),
+   _ALPHA(RGBA_DXT3),
+   _ALPHA(RGBA_DXT5),
  };
  
  #undef _COLOR
@@ -117,32 +127,40 @@ static void radeonSetTexImages( radeonContextPtr rmesa,
  {
     radeonTexObjPtr t = (radeonTexObjPtr)tObj->DriverData;
     const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
-   GLint curOffset;
-   GLint i;
+   GLint curOffset, blitWidth;
+   GLint i, texelBytes;
     GLint numLevels;
     GLint log2Width, log2Height, log2Depth;
  
     /* Set the hardware texture format
      */
-
-   t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
-                      RADEON_TXFORMAT_ALPHA_IN_MAP);
-   t->pp_txfilter &= ~RADEON_YUV_TO_RGB;
-
-   if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
-      t->pp_txformat |= tx_table[ baseImage->TexFormat->MesaFormat ].format;
-      t->pp_txfilter |= tx_table[ baseImage->TexFormat->MesaFormat ].filter;
-   }
-   else {
-      _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
-      return;
+   if ( !t->image_override ) {
+      t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
+                          RADEON_TXFORMAT_ALPHA_IN_MAP);
+      t->pp_txfilter &= ~RADEON_YUV_TO_RGB;
+
+      if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
+         t->pp_txformat |= tx_table[ baseImage->TexFormat->MesaFormat ].format;
+         t->pp_txfilter |= tx_table[ baseImage->TexFormat->MesaFormat ].filter;
+      }
+      else {
+         _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
+         return;
+      }
     }
  
+   texelBytes = baseImage->TexFormat->TexelBytes;
  
     /* Compute which mipmap levels we really want to send to the hardware.
      */
  
-   driCalculateTextureFirstLastLevel( (driTextureObject *) t );
+   if (tObj->Target != GL_TEXTURE_CUBE_MAP)
+      driCalculateTextureFirstLastLevel( (driTextureObject *) t );
+   else {
+      /* r100 can't handle mipmaps for cube/3d textures, so don't waste
+         memory for them */
+      t->base.firstLevel = t->base.lastLevel = tObj->BaseLevel;
+   }
     log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
     log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
     log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2;
@@ -156,6 +174,34 @@ static void radeonSetTexImages( radeonContextPtr rmesa,
      * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
      */
     curOffset = 0;
+   blitWidth = BLIT_WIDTH_BYTES;
+   t->tile_bits = 0;
+
+   /* figure out if this texture is suitable for tiling. */
+   if (texelBytes && (tObj->Target != GL_TEXTURE_RECTANGLE_NV)) {
+      if (rmesa->texmicrotile && (baseImage->Height > 1)) {
+        /* allow 32 (bytes) x 1 mip (which will use two times the space
+           the non-tiled version would use) max if base texture is large enough */
+        if ((numLevels == 1) ||
+          (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
+              (baseImage->Width * texelBytes > 64)) ||
+           ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
+           /* R100 has two microtile bits (only the txoffset reg, not the blitter)
+              weird: X2 + OPT: 32bit correct, 16bit completely hosed
+                     X2: 32bit correct, 16bit correct
+                     OPT: 32bit large mips correct, small mips hosed, 16bit completely hosed */
+           t->tile_bits |= RADEON_TXO_MICRO_TILE_X2 /*| RADEON_TXO_MICRO_TILE_OPT*/;
+        }
+      }
+      if ((baseImage->Width * texelBytes >= 256) && (baseImage->Height >= 16)) {
+        /* R100 disables macro tiling only if mip width is smaller than 256 bytes, and not
+           in the case if height is smaller than 16 (not 100% sure), as does the r200,
+           so need to disable macro tiling in that case */
+        if ((numLevels == 1) || ((baseImage->Width * texelBytes / baseImage->Height) <= 4)) {
+           t->tile_bits |= RADEON_TXO_MACRO_TILE;
+        }
+      }
+   }
  
     for (i = 0; i < numLevels; i++) {
        const struct gl_texture_image *texImage;
@@ -167,31 +213,61 @@ static void radeonSetTexImages( radeonContextPtr rmesa,
  
        /* find image size in bytes */
        if (texImage->IsCompressed) {
-         size = texImage->CompressedSize;
+      /* need to calculate the size AFTER padding even though the texture is
+         submitted without padding.
+         Only handle pot textures currently - don't know if npot is even possible,
+         size calculation would certainly need (trivial) adjustments.
+         Align (and later pad) to 32byte, not sure what that 64byte blit width is
+         good for? */
+         if ((t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) == RADEON_TXFORMAT_DXT1) {
+            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
+            if ((texImage->Width + 3) < 8) /* width one block */
+               size = texImage->CompressedSize * 4;
+            else if ((texImage->Width + 3) < 16)
+               size = texImage->CompressedSize * 2;
+            else size = texImage->CompressedSize;
+         }
+         else /* DXT3/5, 16 bytes per block */
+            if ((texImage->Width + 3) < 8)
+               size = texImage->CompressedSize * 2;
+            else size = texImage->CompressedSize;
        }
        else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-        size = ((texImage->Width * texImage->TexFormat->TexelBytes + 63)
-                & ~63) * texImage->Height;
+        size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
+      }
+      else if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
+        /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+           though the actual offset may be different (if texture is less than
+           32 bytes width) to the untiled case */
+        int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+        size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
+        blitWidth = MAX2(texImage->Width, 64 / texelBytes);
        }
        else {
-         int w = texImage->Width * texImage->TexFormat->TexelBytes;
-         if (w < 32)
-            w = 32;
-         size = w * texImage->Height * texImage->Depth;
+        int w = (texImage->Width * texelBytes + 31) & ~31;
+        size = w * texImage->Height * texImage->Depth;
+        blitWidth = MAX2(texImage->Width, 64 / texelBytes);
        }
        assert(size > 0);
  
-
        /* Align to 32-byte offset.  It is faster to do this unconditionally
         * (no branch penalty).
         */
  
        curOffset = (curOffset + 0x1f) & ~0x1f;
  
-      t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
-      t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
-      t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
-      t->image[0][i].height = size / t->image[0][i].width;
+      if (texelBytes) {
+        t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
+        t->image[0][i].y = 0;
+        t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
+        t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
+      }
+      else {
+         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
+         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
+         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
+         t->image[0][i].height = size / t->image[0][i].width;     
+      }
  
  #if 0
        /* for debugging only and only  applicable to non-rectangle targets */
@@ -215,6 +291,22 @@ static void radeonSetTexImages( radeonContextPtr rmesa,
      */
     t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
  
+   /* Setup remaining cube face blits, if needed */
+   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+      const GLuint faceSize = t->base.totalSize;
+      GLuint face;
+      /* reuse face 0 x/y/width/height - just update the offset when uploading */
+      for (face = 1; face < 6; face++) {
+         for (i = 0; i < numLevels; i++) {
+            t->image[face][i].x =  t->image[0][i].x;
+            t->image[face][i].y =  t->image[0][i].y;
+            t->image[face][i].width  = t->image[0][i].width;
+            t->image[face][i].height = t->image[0][i].height;
+         }
+      }
+      t->base.totalSize = 6 * faceSize; /* total texmem needed */
+   }
+
     /* Hardware state:
      */
     t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
@@ -222,10 +314,27 @@ static void radeonSetTexImages( radeonContextPtr rmesa,
  
     t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
                        RADEON_TXFORMAT_HEIGHT_MASK |
-                       RADEON_TXFORMAT_CUBIC_MAP_ENABLE);
+                       RADEON_TXFORMAT_CUBIC_MAP_ENABLE |
+                       RADEON_TXFORMAT_F5_WIDTH_MASK |
+                       RADEON_TXFORMAT_F5_HEIGHT_MASK);
     t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
                       (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
  
+   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+      assert(log2Width == log2Height);
+      t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
+                         (log2Height << RADEON_TXFORMAT_F5_HEIGHT_SHIFT) |
+                         (RADEON_TXFORMAT_CUBIC_MAP_ENABLE));
+      t->pp_cubic_faces = ((log2Width << RADEON_FACE_WIDTH_1_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_1_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_2_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_2_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_3_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_3_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_4_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_4_SHIFT));
+   }
+
     t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
                     ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
  
@@ -233,11 +342,13 @@ static void radeonSetTexImages( radeonContextPtr rmesa,
      * requires 64-byte aligned pitches, and we may/may not need the
      * blitter.   NPOT only!
      */
-   if (baseImage->IsCompressed)
-      t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-   else
-      t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * baseImage->TexFormat->TexelBytes) + 63) & ~(63);
-   t->pp_txpitch -= 32;
+   if ( !t->image_override ) {
+      if (baseImage->IsCompressed)
+         t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
+      else
+         t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
+      t->pp_txpitch -= 32;
+   }
  
     t->dirty_state = TEX_ALL;
  
@@ -415,9 +526,10 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
  
     /* Set the texture environment state.  Isn't this nice and clean?
      * The chip will automagically set the texture alpha to 0xff when
-    * the texture format does not include an alpha component.  This
+    * the texture format does not include an alpha component. This
      * reduces the amount of special-casing we have to do, alpha-only
-    * textures being a notable exception.
+    * textures being a notable exception. It doesn't work either for luminance
+    * textures realized with I8 and ALPHA_IN_MAP not set (on r100).
      */
      /* Don't cache these results.
      */
@@ -447,7 +559,10 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
          assert(op <= 3);
          switch ( srcRGBi ) {
          case GL_TEXTURE:
-           color_arg[i] = radeon_texture_color[op][unit];
+           if (texUnit->_Current->Image[0][0]->_BaseFormat == GL_ALPHA)
+              color_arg[i] = radeon_zero_color[op];
+           else
+              color_arg[i] = radeon_texture_color[op][unit];
             break;
          case GL_CONSTANT:
             color_arg[i] = radeon_tfactor_color[op];
@@ -466,12 +581,17 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
             break;
          case GL_TEXTURE0:
          case GL_TEXTURE1:
-        case GL_TEXTURE2:
+        case GL_TEXTURE2: {
+           GLuint txunit = srcRGBi - GL_TEXTURE0;
+           if (ctx->Texture.Unit[txunit]._Current->Image[0][0]->_BaseFormat == GL_ALPHA)
+              color_arg[i] = radeon_zero_color[op];
+           else
          /* implement ogl 1.4/1.5 core spec here, not specification of
           * GL_ARB_texture_env_crossbar (which would require disabling blending
           * instead of undefined results when referencing not enabled texunit) */
-          color_arg[i] = radeon_texture_color[op][srcRGBi - GL_TEXTURE0];
-          break;
+             color_arg[i] = radeon_texture_color[op][txunit];
+           }
+           break;
          default:
             return GL_FALSE;
          }
@@ -484,7 +604,10 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
          assert(op <= 1);
          switch ( srcAi ) {
          case GL_TEXTURE:
-           alpha_arg[i] = radeon_texture_alpha[op][unit];
+           if (texUnit->_Current->Image[0][0]->_BaseFormat == GL_LUMINANCE)
+              alpha_arg[i] = radeon_zero_alpha[op+1];
+           else
+              alpha_arg[i] = radeon_texture_alpha[op][unit];
             break;
          case GL_CONSTANT:
             alpha_arg[i] = radeon_tfactor_alpha[op];
@@ -503,9 +626,14 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
             break;
          case GL_TEXTURE0:
          case GL_TEXTURE1:
-        case GL_TEXTURE2:
-          alpha_arg[i] = radeon_texture_alpha[op][srcAi - GL_TEXTURE0];
-          break;
+        case GL_TEXTURE2: {    
+           GLuint txunit = srcAi - GL_TEXTURE0;
+           if (ctx->Texture.Unit[txunit]._Current->Image[0][0]->_BaseFormat == GL_LUMINANCE)
+              alpha_arg[i] = radeon_zero_alpha[op+1];
+           else
+              alpha_arg[i] = radeon_texture_alpha[op][txunit];
+           }
+           break;
          default:
             return GL_FALSE;
          }
@@ -715,6 +843,44 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
     return GL_TRUE;
  }
  
+void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+                        unsigned long long offset, GLint depth, GLuint pitch)
+{      /* Override texture `texname`'s hw offset, pitch and format with caller-supplied values. */
+       radeonContextPtr rmesa = pDRICtx->driverPrivate;
+       struct gl_texture_object *tObj =
+           _mesa_lookup_texture(rmesa->glCtx, texname);
+       radeonTexObjPtr t;
+
+       if (tObj == NULL)       /* unknown texture name: silently ignore */
+               return;
+
+       t = (radeonTexObjPtr) tObj->DriverData;
+       /* NOTE(review): DriverData is dereferenced unchecked -- confirm it cannot be NULL here. */
+       t->image_override = GL_TRUE;    /* radeonSetTexImages() then skips its txformat/txpitch setup */
+
+       if (!offset)            /* zero offset: only the override flag gets set */
+               return;
+
+       t->pp_txoffset = offset;
+       t->pp_txpitch = pitch - 32;     /* same -32 bias radeonSetTexImages() applies to its pitch */
+       /* Pick the hw format from depth: txformat is overwritten, txfilter bits are OR'ed in. */
+       switch (depth) {
+       case 32:
+               t->pp_txformat = tx_table[MESA_FORMAT_ARGB8888].format;
+               t->pp_txfilter |= tx_table[MESA_FORMAT_ARGB8888].filter;
+               break;
+       case 24:
+       default:                /* every depth other than 32 and 16 lands here */
+               t->pp_txformat = tx_table[MESA_FORMAT_RGB888].format;
+               t->pp_txfilter |= tx_table[MESA_FORMAT_RGB888].filter;
+               break;
+       case 16:
+               t->pp_txformat = tx_table[MESA_FORMAT_RGB565].format;
+               t->pp_txfilter |= tx_table[MESA_FORMAT_RGB565].filter;
+               break;
+       }
+}
+
  #define TEXOBJ_TXFILTER_MASK (RADEON_MAX_MIP_LEVEL_MASK |      \
                               RADEON_MIN_FILTER_MASK |          \
                               RADEON_MAG_FILTER_MASK |          \
@@ -739,7 +905,11 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
                                   int unit,
                                   radeonTexObjPtr texobj )
  {
-   GLuint *cmd = RADEON_DB_STATE( tex[unit] );
+/* do not use RADEON_DB_STATE to avoid stale texture caches */
+   int *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+   GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
+
+   RADEON_STATECHANGE( rmesa, tex[unit] );
  
     cmd[TEX_PP_TXFILTER] &= ~TEXOBJ_TXFILTER_MASK;
     cmd[TEX_PP_TXFILTER] |= texobj->pp_txfilter & TEXOBJ_TXFILTER_MASK;
@@ -747,13 +917,39 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
     cmd[TEX_PP_TXFORMAT] |= texobj->pp_txformat & TEXOBJ_TXFORMAT_MASK;
     cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset;
     cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
-   RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.tex[unit] );
  
     if (texobj->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
        GLuint *txr_cmd = RADEON_DB_STATE( txr[unit] );
        txr_cmd[TXR_PP_TEX_SIZE] = texobj->pp_txsize; /* NPOT only! */
        txr_cmd[TXR_PP_TEX_PITCH] = texobj->pp_txpitch; /* NPOT only! */
        RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.txr[unit] );
+      se_coord_fmt |= RADEON_VTX_ST0_NONPARAMETRIC << unit;
+   }
+   else {
+      se_coord_fmt &= ~(RADEON_VTX_ST0_NONPARAMETRIC << unit);
+
+      if (texobj->base.tObj->Target == GL_TEXTURE_CUBE_MAP) {
+        int *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
+        GLuint bytesPerFace = texobj->base.totalSize / 6;
+        ASSERT(texobj->base.totalSize % 6 == 0);
+
+        RADEON_STATECHANGE( rmesa, cube[unit] );
+        cube_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
+        /* don't know if this setup conforms to OpenGL..
+         * at least it matches the behavior of mesa software renderer
+         */
+        cube_cmd[CUBE_PP_CUBIC_OFFSET_0] = texobj->pp_txoffset; /* right */
+        cube_cmd[CUBE_PP_CUBIC_OFFSET_1] = texobj->pp_txoffset + 1 * bytesPerFace; /* left */
+        cube_cmd[CUBE_PP_CUBIC_OFFSET_2] = texobj->pp_txoffset + 2 * bytesPerFace; /* top */
+        cube_cmd[CUBE_PP_CUBIC_OFFSET_3] = texobj->pp_txoffset + 3 * bytesPerFace; /* bottom */
+        cube_cmd[CUBE_PP_CUBIC_OFFSET_4] = texobj->pp_txoffset + 4 * bytesPerFace; /* front */
+        cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset + 5 * bytesPerFace; /* back */
+      }
+   }
+
+   if (se_coord_fmt != rmesa->hw.set.cmd[SET_SE_COORDFMT]) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
     }
  
     texobj->dirty_state &= ~(1<<unit);
@@ -765,96 +961,135 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
  static void set_texgen_matrix( radeonContextPtr rmesa, 
                                GLuint unit,
                                const GLfloat *s_plane,
-                              const GLfloat *t_plane )
+                              const GLfloat *t_plane,
+                              const GLfloat *r_plane,
+                              const GLfloat *q_plane )
  {
-   static const GLfloat scale_identity[4] = { 1,1,1,1 };
-
-   if (!TEST_EQ_4V( s_plane, scale_identity) ||
-       !TEST_EQ_4V( t_plane, scale_identity)) {
-      rmesa->TexGenEnabled |= RADEON_TEXMAT_0_ENABLE<<unit;
-      rmesa->TexGenMatrix[unit].m[0]  = s_plane[0];
-      rmesa->TexGenMatrix[unit].m[4]  = s_plane[1];
-      rmesa->TexGenMatrix[unit].m[8]  = s_plane[2];
-      rmesa->TexGenMatrix[unit].m[12] = s_plane[3];
-
-      rmesa->TexGenMatrix[unit].m[1]  = t_plane[0];
-      rmesa->TexGenMatrix[unit].m[5]  = t_plane[1];
-      rmesa->TexGenMatrix[unit].m[9]  = t_plane[2];
-      rmesa->TexGenMatrix[unit].m[13] = t_plane[3];
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
-   }
+   rmesa->TexGenMatrix[unit].m[0]  = s_plane[0];
+   rmesa->TexGenMatrix[unit].m[4]  = s_plane[1];
+   rmesa->TexGenMatrix[unit].m[8]  = s_plane[2];
+   rmesa->TexGenMatrix[unit].m[12] = s_plane[3];
+
+   rmesa->TexGenMatrix[unit].m[1]  = t_plane[0];
+   rmesa->TexGenMatrix[unit].m[5]  = t_plane[1];
+   rmesa->TexGenMatrix[unit].m[9]  = t_plane[2];
+   rmesa->TexGenMatrix[unit].m[13] = t_plane[3];
+
+   rmesa->TexGenMatrix[unit].m[2]  = r_plane[0];
+   rmesa->TexGenMatrix[unit].m[6]  = r_plane[1];
+   rmesa->TexGenMatrix[unit].m[10] = r_plane[2];
+   rmesa->TexGenMatrix[unit].m[14] = r_plane[3];
+
+   rmesa->TexGenMatrix[unit].m[3]  = q_plane[0];
+   rmesa->TexGenMatrix[unit].m[7]  = q_plane[1];
+   rmesa->TexGenMatrix[unit].m[11] = q_plane[2];
+   rmesa->TexGenMatrix[unit].m[15] = q_plane[3];
+
+   rmesa->TexGenEnabled |= RADEON_TEXMAT_0_ENABLE << unit;
+   rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
  }
  
-/* Ignoring the Q texcoord for now.
- *
- * Returns GL_FALSE if fallback required.  
+/* Returns GL_FALSE if fallback required.
   */
  static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
-{  
+{
     radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
     struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
     GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
     GLuint tmp = rmesa->TexGenEnabled;
-
-   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
-   rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
-   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
+   static const GLfloat reflect[16] = {
+      -1,  0,  0,  0,
+       0, -1,  0,  0,
+       0,  0,  -1, 0,
+       0,  0,  0,  1 };
+
+   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE << unit);
+   rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE << unit);
+   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK << inputshift);
     rmesa->TexGenNeedNormals[unit] = 0;
  
-   if ((texUnit->TexGenEnabled & (S_BIT|T_BIT)) == 0) {
+   if ((texUnit->TexGenEnabled & (S_BIT|T_BIT|R_BIT|Q_BIT)) == 0) {
        /* Disabled, no fallback:
         */
-      rmesa->TexGenEnabled |= 
-        (RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
+      rmesa->TexGenEnabled |=
+        (RADEON_TEXGEN_INPUT_TEXCOORD_0 + unit) << inputshift;
        return GL_TRUE;
     }
-   else if (texUnit->TexGenEnabled & Q_BIT) {
-      /* Very easy to do this, in fact would remove a fallback case
-       * elsewhere, but I haven't done it yet...  Fallback: 
-       */
-      fprintf(stderr, "fallback Q_BIT\n");
-      return GL_FALSE;
+   /* the r100 cannot do texgen for some coords and not for others
+    * we do not detect such cases (certainly can't do it here) and just
+    * ASSUME that when S and T are texgen enabled we do not need other
+    * non-texgen enabled coords, no matter if the R and Q bits are texgen
+    * enabled. Still check for mixed mode texgen for all coords.
+    */
+   else if ( (texUnit->TexGenEnabled & S_BIT) &&
+            (texUnit->TexGenEnabled & T_BIT) &&
+            (texUnit->GenS.Mode == texUnit->GenT.Mode) ) {
+      if ( ((texUnit->TexGenEnabled & R_BIT) &&
+           (texUnit->GenS.Mode != texUnit->GenR.Mode)) ||
+          ((texUnit->TexGenEnabled & Q_BIT) &&
+           (texUnit->GenS.Mode != texUnit->GenQ.Mode)) ) {
+        /* Mixed modes, fallback:
+         */
+        if (RADEON_DEBUG & DEBUG_FALLBACKS)
+           fprintf(stderr, "fallback mixed texgen\n");
+        return GL_FALSE;
+      }
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_TEXMAT_0_ENABLE << unit;
     }
-   else if ((texUnit->TexGenEnabled & (S_BIT|T_BIT)) != (S_BIT|T_BIT) ||
-           texUnit->GenModeS != texUnit->GenModeT) {
-      /* Mixed modes, fallback:
-       */
-      /* fprintf(stderr, "fallback mixed texgen\n"); */
+   else {
+   /* some texgen mode not including both S and T bits */
+      if (RADEON_DEBUG & DEBUG_FALLBACKS)
+        fprintf(stderr, "fallback mixed texgen/nontexgen\n");
        return GL_FALSE;
     }
-   else
-      rmesa->TexGenEnabled |= RADEON_TEXGEN_TEXMAT_0_ENABLE << unit;
  
-   switch (texUnit->GenModeS) {
+   if ((texUnit->TexGenEnabled & (R_BIT | Q_BIT)) != 0) {
+      /* need this here for vtxfmt presumably. Argh we need to set
+         this from way too many places (would be much easier if we could
+         leave tcl q coord always enabled as on r200) */
+      RADEON_STATECHANGE( rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_Q_BIT(unit);
+   }
+
+   switch (texUnit->GenS.Mode) {
     case GL_OBJECT_LINEAR:
        rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_OBJ << inputshift;
-      set_texgen_matrix( rmesa, unit, 
-                        texUnit->ObjectPlaneS,
-                        texUnit->ObjectPlaneT);
+      set_texgen_matrix( rmesa, unit,
+                        texUnit->GenS.ObjectPlane,
+                        texUnit->GenT.ObjectPlane,
+                        texUnit->GenR.ObjectPlane,
+                        texUnit->GenQ.ObjectPlane);
        break;
  
     case GL_EYE_LINEAR:
        rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE << inputshift;
-      set_texgen_matrix( rmesa, unit, 
-                        texUnit->EyePlaneS,
-                        texUnit->EyePlaneT);
+      set_texgen_matrix( rmesa, unit,
+                        texUnit->GenS.EyePlane,
+                        texUnit->GenT.EyePlane,
+                        texUnit->GenR.EyePlane,
+                        texUnit->GenQ.EyePlane);
        break;
  
     case GL_REFLECTION_MAP_NV:
        rmesa->TexGenNeedNormals[unit] = GL_TRUE;
-      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_REFLECT<<inputshift;
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_REFLECT << inputshift;
+      /* TODO: unknown if this is needed/correct */
+      set_texgen_matrix( rmesa, unit, reflect, reflect + 4,
+                       reflect + 8, reflect + 12 );
        break;
  
     case GL_NORMAL_MAP_NV:
        rmesa->TexGenNeedNormals[unit] = GL_TRUE;
-      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_NORMAL<<inputshift;
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_NORMAL << inputshift;
        break;
  
     case GL_SPHERE_MAP:
+      /* the mode which everyone uses :-( */
     default:
        /* Unsupported mode, fallback:
         */
-      /*  fprintf(stderr, "fallback unsupported texgen\n"); */
+      if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+        fprintf(stderr, "fallback GL_SPHERE_MAP\n");
        return GL_FALSE;
     }
  
@@ -886,26 +1121,22 @@ static void disable_tex( GLcontext *ctx, int unit )
           ~((RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit);
  
        RADEON_STATECHANGE( rmesa, tcl );
-      switch (unit) {
-      case 0:
-        rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_TCL_VTX_ST0 |
-                                                  RADEON_TCL_VTX_Q0);
-           break;
-      case 1:
-        rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_TCL_VTX_ST1 |
-                                                  RADEON_TCL_VTX_Q1);
-        break;
-      default:
-        break;
-      }
-
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_ST_BIT(unit) |
+                                               RADEON_Q_BIT(unit));
  
        if (rmesa->TclFallback & (RADEON_TCL_FALLBACK_TEXGEN_0<<unit)) {
          TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
          rmesa->recheck_texgen[unit] = GL_TRUE;
        }
  
-
+      if (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] & RADEON_TXFORMAT_CUBIC_MAP_ENABLE) {
+      /* this seems to be a genuine (r100 only?) hw bug. Need to remove the
+         cubic_map bit on unit 2 when the unit is disabled, otherwise every
+        2nd (2d) mipmap on unit 0 will be broken (may not be needed for other
+        units, better be safe than sorry though).*/
+        RADEON_STATECHANGE( rmesa, tex[unit] );
+        rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= ~RADEON_TXFORMAT_CUBIC_MAP_ENABLE;
+      }
  
        {
          GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
@@ -946,13 +1177,55 @@ static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
        RADEON_FIREVERTICES( rmesa );
        radeonSetTexImages( rmesa, tObj );
        radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock ) 
+      if ( !t->base.memBlock && !t->image_override ) 
         return GL_FALSE;
     }
  
     return GL_TRUE;
  }
  
+static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
+{  /* Lay out and upload the cube map bound to `unit`; GL_FALSE if it ends up without a memBlock. */
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *tObj = texUnit->_Current;
+   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+   GLuint face;
+
+   /* If the NON_POWER2 format bit is set, clear it and mark every face
+    * dirty so that all six faces are laid out and uploaded again below. */
+   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
+      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
+      for (face = 0; face < 6; face++)
+         t->base.dirty_images[face] = ~0;
+   }
+
+   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+
+   if ( t->base.dirty_images[0] || t->base.dirty_images[1] ||
+        t->base.dirty_images[2] || t->base.dirty_images[3] ||
+        t->base.dirty_images[4] || t->base.dirty_images[5] ) {
+      /* flush */
+      RADEON_FIREVERTICES( rmesa );
+      /* layout memory space, once for all faces */
+      radeonSetTexImages( rmesa, tObj );
+   }
+
+   /* upload (per face); only faces with dirty images are sent */
+   for (face = 0; face < 6; face++) {
+      if (t->base.dirty_images[face]) {
+         radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, face );
+      }
+   }
+   /* NOTE(review): unlike enable_tex_2d/enable_tex_rect, image_override is not checked here. */
+   if ( !t->base.memBlock ) {
+      /* texmem alloc failed, use s/w fallback */
+      return GL_FALSE;
+   }
+
+   return GL_TRUE;
+}
+
  static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
  {
     radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
@@ -971,7 +1244,8 @@ static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
        RADEON_FIREVERTICES( rmesa );
        radeonSetTexImages( rmesa, tObj );
        radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock /* && !rmesa->prefer_gart_client_texturing  FIXME */ ) {
+      if ( !t->base.memBlock &&
+           !t->image_override /* && !rmesa->prefer_gart_client_texturing  FIXME */ ) {
          fprintf(stderr, "%s: upload failed\n", __FUNCTION__);
          return GL_FALSE;
        }
@@ -994,6 +1268,9 @@ static GLboolean update_tex_common( GLcontext *ctx, int unit )
        fprintf(stderr, "%s: border\n", __FUNCTION__);
        return GL_FALSE;
     }
+   /* yuv conversion only works in first unit */
+   if (unit != 0 && (t->pp_txfilter & RADEON_YUV_TO_RGB))
+      return GL_FALSE;
  
     /* Update state if this is a different texture object to last
      * time.
@@ -1024,16 +1301,15 @@ static GLboolean update_tex_common( GLcontext *ctx, int unit )
  
        RADEON_STATECHANGE( rmesa, tcl );
  
-      if (unit == 0)
-         rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_ST0;
-      else 
-         rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_ST1;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
  
        rmesa->recheck_texgen[unit] = GL_TRUE;
     }
  
     if (t->dirty_state & (1<<unit)) {
        import_tex_obj_state( rmesa, unit, t );
+      /* may need to update texture matrix (for texrect adjustments) */
+      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
     }
  
     if (rmesa->recheck_texgen[unit]) {
@@ -1043,7 +1319,7 @@ static GLboolean update_tex_common( GLcontext *ctx, int unit )
        rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
     }
  
-   format = tObj->Image[0][tObj->BaseLevel]->Format;
+   format = tObj->Image[0][tObj->BaseLevel]->_BaseFormat;
     if ( rmesa->state.texture.unit[unit].format != format ||
         rmesa->state.texture.unit[unit].envMode != texUnit->EnvMode ) {
        rmesa->state.texture.unit[unit].format = format;
@@ -1063,11 +1339,7 @@ static GLboolean radeonUpdateTextureUnit( GLcontext *ctx, int unit )
  {
     struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
  
-   TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_TEXRECT_0 << unit, 0 );
-
     if ( texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT) ) {
-      TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_TEXRECT_0 << unit, 1 );
-
        return (enable_tex_rect( ctx, unit ) &&
               update_tex_common( ctx, unit ));
     }
@@ -1075,6 +1347,10 @@ static GLboolean radeonUpdateTextureUnit( GLcontext *ctx, int unit )
        return (enable_tex_2d( ctx, unit ) &&
               update_tex_common( ctx, unit ));
     }
+   else if ( texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT) ) {
+      return (enable_tex_cube( ctx, unit ) &&
+             update_tex_common( ctx, unit ));
+   }
     else if ( texUnit->_ReallyEnabled ) {
        return GL_FALSE;
     }
@@ -1090,7 +1366,8 @@ void radeonUpdateTextureState( GLcontext *ctx )
     GLboolean ok;
  
     ok = (radeonUpdateTextureUnit( ctx, 0 ) &&
-        radeonUpdateTextureUnit( ctx, 1 ));
+        radeonUpdateTextureUnit( ctx, 1 ) &&
+        radeonUpdateTextureUnit( ctx, 2 ));
  
     FALLBACK( rmesa, RADEON_FALLBACK_TEXTURE, !ok );