Merge commit 'origin/st-shader-varients'
[mesa.git] / src / mesa / drivers / dri / radeon / radeon_span.c
index a6ea8f021e48b510304a4fcd198d4a8f35335f39..37904dc8dc99debd11b9d694a06ec3adca326a3c 100644 (file)
@@ -41,6 +41,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include "main/glheader.h"
+#include "main/texformat.h"
 #include "swrast/swrast.h"
 
 #include "radeon_common.h"
@@ -51,96 +52,252 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
 
-static GLubyte *radeon_ptr32(const struct radeon_renderbuffer * rrb,
-                            GLint x, GLint y)
+
+/* The r200 depth buffer is always tiled.  This is the addressing formula
+   from the hardware docs (assuming it was transcribed without typos).
+*/
+#if defined(RADEON_R200)
+/*
+ * Return the address of the 2-byte depth value at (x, y) inside the
+ * mapping of rrb->bo.  With rrb->has_surface set, plain linear
+ * cpp/pitch addressing is used; otherwise the byte offset is assembled
+ * from individual x/y bits following the tiling formula.
+ */
+static GLubyte *r200_depth_2byte(const struct radeon_renderbuffer * rrb,
+                                GLint x, GLint y)
+{
+    GLubyte *base = rrb->bo->ptr;
+    GLuint blk;
+    GLint off;
+
+    if (rrb->has_surface) {
+       off = x * rrb->cpp + y * rrb->pitch;
+       return &base[off];
+    }
+
+    /* block index: one block per 64 pixels in x (x >> 6) and 16 rows in y (y >> 4) */
+    blk = (((y  >> 4) * (rrb->pitch >> 8) + (x >> 6)));
+
+    /* pairs of blocks share a 4 KB unit; bit 11 picks the half */
+    off = 0;
+    off += (blk >> 1) << 12;
+    if ((rrb->pitch >> 8) & 0x1)
+       off += (blk & 0x1) << 11;
+    else
+       off += ((blk & 0x1) ^ ((y >> 4) & 0x1)) << 11;
+
+    /* bits contributed by y */
+    off += (((y >> 2) & 0x3) << 9) + (((y >> 1) & 0x1) << 4) + ((y & 0x1) << 2);
+
+    /* bits contributed by x */
+    off += (((x >> 3) & 0x1) << 8) + (((x >> 4) & 0x3) << 6) + (((x >> 2) & 0x1) << 5)
+         + (((x >> 1) & 0x1) << 3) + ((x & 0x1) << 1);
+
+    return &base[off];
+}
+
+/*
+ * Return the address of the 4-byte depth/stencil value at (x, y) inside
+ * the mapping of rrb->bo.  With rrb->has_surface set, plain linear
+ * cpp/pitch addressing is used; otherwise the byte offset is assembled
+ * from individual x/y bits following the tiling formula.
+ */
+static GLubyte *r200_depth_4byte(const struct radeon_renderbuffer * rrb,
+                                GLint x, GLint y)
+{
+    GLubyte *base = rrb->bo->ptr;
+    GLuint blk;
+    GLint off;
+
+    if (rrb->has_surface) {
+       off = x * rrb->cpp + y * rrb->pitch;
+       return &base[off];
+    }
+
+    /* block index: one block per 32 pixels in x (x >> 5) and 16 rows in y;
+     * y is masked to 11 bits before the row shift */
+    blk = (((y & 0x7ff) >> 4) * (rrb->pitch >> 7) + (x >> 5));
+
+    /* pairs of blocks share a 4 KB unit; bit 11 picks the half */
+    off = 0;
+    off += (blk >> 1) << 12;
+    if ((rrb->pitch >> 7) & 0x1)
+       off += (blk & 0x1) << 11;
+    else
+       off += ((blk & 0x1) ^ ((y >> 4) & 0x1)) << 11;
+
+    /* bits contributed by y */
+    off += (((y >> 2) & 0x3) << 9) + (((y >> 1) & 0x1) << 5) + ((y & 0x1) << 3);
+
+    /* bits contributed by x */
+    off += (((x >> 2) & 0x1) << 8) + (((x >> 3) & 0x3) << 6)
+         + (((x >> 1) & 0x1) << 4) + ((x & 0x1) << 2);
+
+    return &base[off];
+}
+#endif
+
+/* r600 tiling
+ * two main types:
+ * - 1D (akin to macro-linear/micro-tiled on older asics)
+ * - 2D (akin to macro-tiled/micro-tiled on older asics)
+ * only 1D tiling is implemented below
+ */
+#if defined(RADEON_R600)
+/*
+ * Compute the byte offset of element (x, y) in a 1D-tiled r600 buffer.
+ *
+ * rrb         renderbuffer supplying cpp, pitch and base.Height
+ * x, y        pixel coordinates
+ * is_depth    non-zero selects the depth-buffer element ordering
+ * is_stencil  only consulted for 4-byte depth buffers: non-zero addresses
+ *             the stencil byte, zero addresses the 3-byte depth value
+ *
+ * Returns the offset (slice base + tile base + offset inside the tile).
+ */
+static inline GLint r600_1d_tile_helper(const struct radeon_renderbuffer * rrb,
+                                       GLint x, GLint y, GLint is_depth, GLint is_stencil)
+{
+    /* fixed geometry used here: 8x8x1 tiles, one sample, slice 0, sample 0 */
+    enum { TILE_W = 8, TILE_H = 8, TILE_D = 1, SAMPLES = 1, SLICE = 0, SAMPLE_IDX = 0 };
+
+    const GLint bpe = rrb->cpp;                 /* bytes per element */
+    const GLint row_elems = rrb->pitch / bpe;   /* pitch in elements */
+    const GLint rows = rrb->base.Height;
+    const GLint tile_size = TILE_W * TILE_H * TILE_D * bpe * SAMPLES;
+    const GLint row_tiles = row_elems / TILE_W;
+    const GLint slice_tiles = row_tiles * (rows / TILE_H);
+    const GLint slice_base = (SLICE / TILE_D) * slice_tiles * tile_size;
+    const GLint tile_base = (((y / TILE_H) * row_tiles) + (x / TILE_W)) * tile_size;
+
+    /* the three low bits of each coordinate address a pixel inside the tile */
+    const GLint xb0 = (x >> 0) & 1, xb1 = (x >> 1) & 1, xb2 = (x >> 2) & 1;
+    const GLint yb0 = (y >> 0) & 1, yb1 = (y >> 1) & 1, yb2 = (y >> 2) & 1;
+
+    GLint pn = 0;       /* pixel number inside the tile, 0..63 */
+    GLint in_tile;      /* byte offset inside the tile */
+
+    if (is_depth) {
+        GLint px_off = 0;
+
+        /* depth: x and y bits strictly interleaved, x first */
+        pn = xb0 | (yb0 << 1) | (xb1 << 2) | (yb1 << 3) | (xb2 << 4) | (yb2 << 5);
+
+        if (bpe == 2) {
+            px_off = pn * bpe * SAMPLES;
+        } else if (bpe == 4) {
+            /* stencil and depth data are stored separately within a tile:
+             * a contiguous 64-byte stencil tile (1 byte per element) comes
+             * first, followed by the depth tile (3 bytes per element).
+             */
+            if (is_stencil)
+                px_off = pn * 1 * SAMPLES;
+            else
+                px_off = (pn * 3 * SAMPLES) + 64;
+        }
+        in_tile = px_off + (SAMPLE_IDX * bpe);
+    } else {
+        /* color: bit ordering depends on the element size; any other
+         * size leaves pn at 0 */
+        switch (bpe) {
+        case 1:
+            pn = xb0 | (xb1 << 1) | (xb2 << 2) | (yb1 << 3) | (yb0 << 4) | (yb2 << 5);
+            break;
+        case 2:
+            pn = xb0 | (xb1 << 1) | (xb2 << 2) | (yb0 << 3) | (yb1 << 4) | (yb2 << 5);
+            break;
+        case 4:
+            pn = xb0 | (xb1 << 1) | (yb0 << 2) | (xb2 << 3) | (yb1 << 4) | (yb2 << 5);
+            break;
+        }
+        in_tile = (SAMPLE_IDX * (tile_size / SAMPLES)) + (pn * bpe);
+    }
+
+    return slice_base + tile_base + in_tile;
+}
+
+/* depth buffers */
+/* Address of the depth element at (x, y): helper called with is_depth=1, is_stencil=0. */
+static GLubyte *r600_ptr_depth(const struct radeon_renderbuffer * rrb,
+                              GLint x, GLint y)
+{
+    GLubyte *base = rrb->bo->ptr;
+    const GLint off = r600_1d_tile_helper(rrb, x, y, 1, 0);
+
+    return base + off;
+}
+
+/* Address of the stencil element at (x, y): helper called with is_depth=1, is_stencil=1. */
+static GLubyte *r600_ptr_stencil(const struct radeon_renderbuffer * rrb,
+                                GLint x, GLint y)
+{
+    GLubyte *base = rrb->bo->ptr;
+    const GLint off = r600_1d_tile_helper(rrb, x, y, 1, 1);
+
+    return base + off;
+}
+
+/* color buffers
+ *
+ * Return the address of the pixel at (x, y) inside the mapping of rrb->bo.
+ * Linear cpp/pitch addressing is used when the buffer has a surface or when
+ * neither tiling flag is set in bo->flags; otherwise the offset comes from
+ * r600_1d_tile_helper() with is_depth = 0 and is_stencil = 0.
+ */
+static GLubyte *r600_ptr_color(const struct radeon_renderbuffer * rrb,
+                              GLint x, GLint y)
 {
     GLubyte *ptr = rrb->bo->ptr;
     uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
     GLint offset;
-    GLint nmacroblkpl;
-    GLint nmicroblkpl;
 
     if (rrb->has_surface || !(rrb->bo->flags & mask)) {
         offset = x * rrb->cpp + y * rrb->pitch;
     } else {
-        offset = 0;
-        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
-            if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
-                nmacroblkpl = rrb->pitch >> 5;
-                offset += ((y >> 4) * nmacroblkpl) << 11;
-                offset += ((y & 15) >> 1) << 8;
-                offset += (y & 1) << 4;
-                offset += (x >> 5) << 11;
-                offset += ((x & 31) >> 2) << 5;
-                offset += (x & 3) << 2;
-            } else {
-                nmacroblkpl = rrb->pitch >> 6;
-                offset += ((y >> 3) * nmacroblkpl) << 11;
-                offset += (y & 7) << 8;
-                offset += (x >> 6) << 11;
-                offset += ((x & 63) >> 3) << 5;
-                offset += (x & 7) << 2;
-            }
-        } else {
-            nmicroblkpl = ((rrb->pitch + 31) & ~31) >> 5;
-            offset += (y * nmicroblkpl) << 5;
-            offset += (x >> 3) << 5;
-            offset += (x & 7) << 2;
-        }
+           /* any tiling flag set: treated as 1D tiling, the only mode the helper implements */
+           offset = r600_1d_tile_helper(rrb, x, y, 0, 0);
     }
     return &ptr[offset];
 }
 
-static GLubyte *radeon_ptr16(const struct radeon_renderbuffer * rrb,
+#else
+
+/* radeon tiling on r300-r500 has 4 states,
+   macro-linear/micro-linear
+   macro-linear/micro-tiled
+   macro-tiled /micro-linear
+   macro-tiled /micro-tiled
+   1 byte surface 
+   2 byte surface - two types - we only provide 8x2 microtiling
+   4 byte surface
+   8/16 byte (unused)
+*/
+/* Return the address of the 4-byte element at (x, y) inside the mapping of
+ * rrb->bo.  Linear cpp/pitch addressing is used when the buffer has a
+ * surface or when neither tiling flag is set in bo->flags; otherwise the
+ * offset is built per tiling mode from the MACRO_TILE / MICRO_TILE flags.
+ */
+static GLubyte *radeon_ptr_4byte(const struct radeon_renderbuffer * rrb,
                              GLint x, GLint y)
 {
     GLubyte *ptr = rrb->bo->ptr;
     uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
     GLint offset;
-    GLint nmacroblkpl;
-    GLint nmicroblkpl;
 
     if (rrb->has_surface || !(rrb->bo->flags & mask)) {
         offset = x * rrb->cpp + y * rrb->pitch;
     } else {
         offset = 0;
         if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
-            if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
-                nmacroblkpl = rrb->pitch >> 6;
-                offset += ((y >> 4) * nmacroblkpl) << 11;
-                offset += ((y & 15) >> 1) << 8;
-                offset += (y & 1) << 4;
-                offset += (x >> 6) << 11;
-                offset += ((x & 63) >> 3) << 5;
-                offset += (x & 7) << 1;
+           if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
+               /* macro-tiled + micro-tiled: 2 KB blocks of 32 pixels x 16 rows;
+                * bits 7..10 are XOR combinations of x and y bits */
+               offset = ((y >> 4) * (rrb->pitch >> 7) + (x >> 5)) << 11;
+               offset += (((y >> 3) ^ (x >> 5)) & 0x1) << 10;
+               offset += (((y >> 4) ^ (x >> 4)) & 0x1) << 9;
+               offset += (((y >> 2) ^ (x >> 4)) & 0x1) << 8;
+               offset += (((y >> 3) ^ (x >> 3)) & 0x1) << 7;
+               offset += ((y >> 1) & 0x1) << 6;
+               offset += ((x >> 2) & 0x1) << 5;
+               offset += (y & 1) << 4;
+               offset += (x & 3) << 2;
             } else {
-                nmacroblkpl = rrb->pitch >> 7;
-                offset += ((y >> 3) * nmacroblkpl) << 11;
-                offset += (y & 7) << 8;
-                offset += (x >> 7) << 11;
-                offset += ((x & 127) >> 4) << 5;
-                offset += (x & 15) << 2;
+               /* macro-tiled + micro-linear: 2 KB blocks of 64 pixels x 8 rows;
+                * the low 4 x bits address 16 consecutive 4-byte pixels */
+               offset = ((y >> 3) * (rrb->pitch >> 8) + (x >> 6)) << 11;
+               offset += (((y >> 2) ^ (x >> 6)) & 0x1) << 10;
+               offset += (((y >> 3) ^ (x >> 5)) & 0x1) << 9;
+               offset += (((y >> 1) ^ (x >> 5)) & 0x1) << 8;
+               offset += (((y >> 2) ^ (x >> 4)) & 0x1) << 7;
+               offset += (y & 1) << 6;
+               offset += (x & 15) << 2;
             }
         } else {
-            nmicroblkpl = ((rrb->pitch + 31) & ~31) >> 5;
-            offset += (y * nmicroblkpl) << 5;
-            offset += (x >> 4) << 5;
-            offset += (x & 15) << 2;
+           /* macro-linear + micro-tiled: 32-byte micro tiles of 4 pixels x 2 rows */
+           offset = ((y >> 1) * (rrb->pitch >> 4) + (x >> 2)) << 5;
+           offset += (y & 1) << 4;
+           offset += (x & 3) << 2;
         }
     }
     return &ptr[offset];
 }
 
-static GLubyte *radeon_ptr(const struct radeon_renderbuffer * rrb,
-                          GLint x, GLint y)
+/* Return the address of the 2-byte element at (x, y) inside the mapping of
+ * rrb->bo, using 8x2 micro tiles where micro tiling applies.  Linear
+ * cpp/pitch addressing is used when the buffer has a surface or when
+ * neither tiling flag is set in bo->flags.
+ *
+ * In every tiled mode the lowest x bits are scaled by the 2-byte element
+ * size (<< 1); a 4-byte scale would make distinct x values share an offset.
+ * NOTE(review): the macro-tiled low-bit terms were derived by analogy with
+ * the micro-tiled-only branch and the 4-byte variant - confirm on hardware.
+ */
+static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
+                                    GLint x, GLint y)
 {
     GLubyte *ptr = rrb->bo->ptr;
     uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
     GLint offset;
-    GLint microblkxs;
-    GLint macroblkxs;
-    GLint nmacroblkpl;
-    GLint nmicroblkpl;
 
     if (rrb->has_surface || !(rrb->bo->flags & mask)) {
         offset = x * rrb->cpp + y * rrb->pitch;
@@ -148,48 +305,34 @@ static GLubyte *radeon_ptr(const struct radeon_renderbuffer * rrb,
         offset = 0;
         if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
             if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
-                microblkxs = 16 / rrb->cpp;
-                macroblkxs = 128 / rrb->cpp;
-                nmacroblkpl = rrb->pitch / macroblkxs;
-                offset += ((y >> 4) * nmacroblkpl) << 11;
-                offset += ((y & 15) >> 1) << 8;
-                offset += (y & 1) << 4;
-                offset += (x / macroblkxs) << 11;
-                offset += ((x & (macroblkxs - 1)) / microblkxs) << 5;
-                offset += (x & (microblkxs - 1)) * rrb->cpp;
+               offset = ((y >> 4) * (rrb->pitch >> 7) + (x >> 6)) << 11;
+               offset += (((y >> 3) ^ (x >> 6)) & 0x1) << 10;
+               offset += (((y >> 4) ^ (x >> 5)) & 0x1) << 9;
+               offset += (((y >> 2) ^ (x >> 5)) & 0x1) << 8;
+               offset += (((y >> 3) ^ (x >> 4)) & 0x1) << 7;
+               offset += ((y >> 1) & 0x1) << 6;
+               offset += ((x >> 3) & 0x1) << 5;
+               offset += (y & 1) << 4;
+               offset += (x & 7) << 1; /* 8 elements x 2 bytes per micro-tile row */
             } else {
-                microblkxs = 32 / rrb->cpp;
-                macroblkxs = 256 / rrb->cpp;
-                nmacroblkpl = rrb->pitch / macroblkxs;
-                offset += ((y >> 3) * nmacroblkpl) << 11;
-                offset += (y & 7) << 8;
-                offset += (x / macroblkxs) << 11;
-                offset += ((x & (macroblkxs - 1)) / microblkxs) << 5;
-                offset += (x & (microblkxs - 1)) * rrb->cpp;
+               offset = ((y >> 3) * (rrb->pitch >> 8) + (x >> 7)) << 11;
+               offset += (((y >> 2) ^ (x >> 7)) & 0x1) << 10;
+               offset += (((y >> 3) ^ (x >> 6)) & 0x1) << 9;
+               offset += (((y >> 1) ^ (x >> 6)) & 0x1) << 8;
+               offset += (((y >> 2) ^ (x >> 5)) & 0x1) << 7;
+               offset += (y & 1) << 6;
+               offset += ((x >> 4) & 0x1) << 5;
+               offset += (x & 15) << 1; /* low 4 x bits, 2 bytes per element (bit 5 holds x bit 4) */
             }
         } else {
-            microblkxs = 32 / rrb->cpp;
-            nmicroblkpl = ((rrb->pitch + 31) & ~31) >> 5;
-            offset += (y * nmicroblkpl) << 5;
-            offset += (x / microblkxs) << 5;
-            offset += (x & (microblkxs - 1)) * rrb->cpp;
+           offset = ((y >> 1) * (rrb->pitch >> 4) + (x >> 3)) << 5;
+           offset += (y & 0x1) << 4;
+           offset += (x & 0x7) << 1;
         }
     }
     return &ptr[offset];
 }
 
-#ifndef COMPILE_R300
-static uint32_t
-z24s8_to_s8z24(uint32_t val)
-{
-   return (val << 24) | (val >> 8);
-}
-
-static uint32_t
-s8z24_to_z24s8(uint32_t val)
-{
-   return (val >> 24) | (val << 8);
-}
 #endif
 
 /*
@@ -251,7 +394,23 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_RGB565
 #define TAG2(x,y) radeon##x##_RGB565##y
-#define GET_PTR(X,Y) radeon_ptr16(rrb, (X) + x_off, (Y) + y_off)
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5_REV
+
+#define TAG(x)    radeon##x##_RGB565_REV
+#define TAG2(x,y) radeon##x##_RGB565_REV##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
 #include "spantmp2.h"
 
 /* 16 bit, ARGB1555 color spanline and pixel functions
@@ -261,7 +420,23 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_ARGB1555
 #define TAG2(x,y) radeon##x##_ARGB1555##y
-#define GET_PTR(X,Y) radeon_ptr16(rrb, (X) + x_off, (Y) + y_off)
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5
+
+#define TAG(x)    radeon##x##_ARGB1555_REV
+#define TAG2(x,y) radeon##x##_ARGB1555_REV##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
 #include "spantmp2.h"
 
 /* 16 bit, RGBA4 color spanline and pixel functions
@@ -271,7 +446,23 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_ARGB4444
 #define TAG2(x,y) radeon##x##_ARGB4444##y
-#define GET_PTR(X,Y) radeon_ptr16(rrb, (X) + x_off, (Y) + y_off)
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4
+
+#define TAG(x)    radeon##x##_ARGB4444_REV
+#define TAG2(x,y) radeon##x##_ARGB4444_REV##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
 #include "spantmp2.h"
 
 /* 32 bit, xRGB8888 color spanline and pixel functions
@@ -281,11 +472,19 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_xRGB8888
 #define TAG2(x,y) radeon##x##_xRGB8888##y
-#define GET_VALUE(_x, _y) ((*(GLuint*)(radeon_ptr32(rrb, _x + x_off, _y + y_off)) | 0xff000000))
+#if defined(RADEON_R600)
+#define GET_VALUE(_x, _y) ((*(GLuint*)(r600_ptr_color(rrb, _x + x_off, _y + y_off)) | 0xff000000))
 #define PUT_VALUE(_x, _y, d) { \
-   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x + x_off, _y + y_off );                \
+   GLuint *_ptr = (GLuint*)r600_ptr_color( rrb, _x + x_off, _y + y_off );              \
    *_ptr = d;                                                          \
 } while (0)
+#else
+#define GET_VALUE(_x, _y) ((*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) | 0xff000000))
+#define PUT_VALUE(_x, _y, d) do { /* do/while(0): expands to one statement */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, (_x) + x_off, (_y) + y_off );        \
+   *_ptr = (d);                                                        \
+} while (0)
+#endif
 #include "spantmp2.h"
 
 /* 32 bit, ARGB8888 color spanline and pixel functions
@@ -295,7 +494,55 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_ARGB8888
 #define TAG2(x,y) radeon##x##_ARGB8888##y
-#define GET_PTR(X,Y) radeon_ptr32(rrb, (X) + x_off, (Y) + y_off)
+#if defined(RADEON_R600)
+#define GET_VALUE(_x, _y) (*(GLuint*)(r600_ptr_color(rrb, (_x) + x_off, (_y) + y_off)))
+#define PUT_VALUE(_x, _y, d) do { /* do/while(0): expands to one statement */ \
+   GLuint *_ptr = (GLuint*)r600_ptr_color( rrb, (_x) + x_off, (_y) + y_off );          \
+   *_ptr = (d);                                                        \
+} while (0)
+#else
+#define GET_VALUE(_x, _y) (*(GLuint*)(radeon_ptr_4byte(rrb, (_x) + x_off, (_y) + y_off)))
+#define PUT_VALUE(_x, _y, d) do { /* do/while(0): expands to one statement */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, (_x) + x_off, (_y) + y_off );        \
+   *_ptr = (d);                                                        \
+} while (0)
+#endif
+#include "spantmp2.h"
+
+/* 32 bit, BGRx8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8
+
+#define TAG(x)    radeon##x##_BGRx8888
+#define TAG2(x,y) radeon##x##_BGRx8888##y
+#if defined(RADEON_R600)
+#define GET_VALUE(_x, _y) ((*(GLuint*)(r600_ptr_color(rrb, (_x) + x_off, (_y) + y_off)) | 0x000000ff))
+#define PUT_VALUE(_x, _y, d) do { /* do/while(0): expands to one statement */ \
+   GLuint *_ptr = (GLuint*)r600_ptr_color( rrb, (_x) + x_off, (_y) + y_off );          \
+   *_ptr = (d);                                                        \
+} while (0)
+#else
+#define GET_VALUE(_x, _y) ((*(GLuint*)(radeon_ptr_4byte(rrb, (_x) + x_off, (_y) + y_off)) | 0x000000ff))
+#define PUT_VALUE(_x, _y, d) do { /* do/while(0): expands to one statement */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, (_x) + x_off, (_y) + y_off );        \
+   *_ptr = (d);                                                        \
+} while (0)
+#endif
+#include "spantmp2.h"
+
+/* 32 bit, BGRA8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8
+
+#define TAG(x)    radeon##x##_BGRA8888
+#define TAG2(x,y) radeon##x##_BGRA8888##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_4byte(rrb, (X) + x_off, (Y) + y_off)
+#endif
 #include "spantmp2.h"
 
 /* ================================================================
@@ -316,11 +563,27 @@ s8z24_to_z24s8(uint32_t val)
  */
 #define VALUE_TYPE GLushort
 
+#if defined(RADEON_R200)
+#define WRITE_DEPTH( _x, _y, d )                                       \
+   *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off) = d
+#elif defined(RADEON_R600)
 #define WRITE_DEPTH( _x, _y, d )                                       \
-   *(GLushort *)radeon_ptr(rrb, _x + x_off, _y + y_off) = d
+   *(GLushort *)r600_ptr_depth(rrb, _x + x_off, _y + y_off) = d
+#else
+#define WRITE_DEPTH( _x, _y, d )                                       \
+   *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off) = d
+#endif
 
+#if defined(RADEON_R200)
 #define READ_DEPTH( d, _x, _y )                                                \
-   d = *(GLushort *)radeon_ptr(rrb, _x + x_off, _y + y_off)
+   d = *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off)
+#elif defined(RADEON_R600)
+#define READ_DEPTH( d, _x, _y )                                                \
+   d = *(GLushort *)r600_ptr_depth(rrb, _x + x_off, _y + y_off)
+#else
+#define READ_DEPTH( d, _x, _y )                                                \
+   d = *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off)
+#endif
 
 #define TAG(x) radeon##x##_z16
 #include "depthtmp.h"
@@ -332,39 +595,64 @@ s8z24_to_z24s8(uint32_t val)
  */
 #define VALUE_TYPE GLuint
 
-#ifdef COMPILE_R300
+#if defined(RADEON_R300)
 #define WRITE_DEPTH( _x, _y, d )                                       \
 do {                                                                   \
-   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x + x_off, _y + y_off );                \
-   GLuint tmp = *_ptr;                         \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );            \
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0x000000ff;                                                  \
    tmp |= ((d << 8) & 0xffffff00);                                     \
+   *_ptr = CPU_TO_LE32(tmp);                                            \
+} while (0)
+#elif defined(RADEON_R600)
+#define WRITE_DEPTH( _x, _y, d )                                       \
+do {                                                                   \
+   GLuint *_ptr = (GLuint*)r600_ptr_depth( rrb, _x + x_off, _y + y_off );              \
+   GLuint tmp = *_ptr;                         \
+   tmp &= 0xff000000;                                                  \
+   tmp |= ((d) & 0x00ffffff);                                  \
    *_ptr = tmp;                                        \
 } while (0)
+#elif defined(RADEON_R200)
+#define WRITE_DEPTH( _x, _y, d )                                       \
+do {                                                                   \
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );            \
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
+   tmp &= 0xff000000;                                                  \
+   tmp |= ((d) & 0x00ffffff);                                          \
+   *_ptr = CPU_TO_LE32(tmp);                                            \
+} while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )                                       \
 do {                                                                   \
-   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x + x_off, _y + y_off );        \
-   GLuint tmp = *_ptr;                                                 \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );    \
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0xff000000;                                                  \
    tmp |= ((d) & 0x00ffffff);                                          \
-   *_ptr = tmp;                                        \
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #endif
 
-#ifdef COMPILE_R300
+#if defined(RADEON_R300)
 #define READ_DEPTH( d, _x, _y )                                                \
   do {                                                                 \
-    d = (*(GLuint*)(radeon_ptr32(rrb, _x + x_off, _y + y_off)) & 0xffffff00) >> 8; \
+    d = (LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off))) & 0xffffff00) >> 8; \
+  }while(0)
+#elif defined(RADEON_R600)
+#define READ_DEPTH( d, _x, _y )                                                \
+  do {                                                                 \
+    d = (*(GLuint*)(r600_ptr_depth(rrb, _x + x_off, _y + y_off)) & 0x00ffffff); \
+  }while(0)
+#elif defined(RADEON_R200)
+#define READ_DEPTH( d, _x, _y )                                                \
+  do {                                                                 \
+    d = LE32_TO_CPU(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off))) & 0x00ffffff; \
   }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )        \
-  d = *(GLuint*)(radeon_ptr32(rrb, _x + x_off, _y + y_off)) & 0x00ffffff;
+  d = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off))) & 0x00ffffff;
 #endif
-/*
-    fprintf(stderr, "dval(%d, %d, %d, %d)=0x%08X\n", _x, xo, _y, yo, d);\
-   d = *(GLuint*)(radeon_ptr(rrb, _x,  _y )) & 0x00ffffff;
-*/
+
 #define TAG(x) radeon##x##_z24
 #include "depthtmp.h"
 
@@ -376,36 +664,64 @@ do {                                                                      \
  */
 #define VALUE_TYPE GLuint
 
-#ifdef COMPILE_R300
+#if defined(RADEON_R300)
 #define WRITE_DEPTH( _x, _y, d )                                       \
 do {                                                                   \
-   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x + x_off, _y + y_off );                \
-   *_ptr = d;                                                          \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );            \
+   *_ptr = CPU_TO_LE32((((d) & 0xff000000) >> 24) | (((d) & 0x00ffffff) << 8));   \
 } while (0)
-#else
+#elif defined(RADEON_R600)
 #define WRITE_DEPTH( _x, _y, d )                                       \
 do {                                                                   \
-   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x + x_off, _y + y_off );        \
-   GLuint tmp = z24s8_to_s8z24(d);                                     \
+   GLuint *_ptr = (GLuint*)r600_ptr_depth( rrb, _x + x_off, _y + y_off );              \
+   GLuint tmp = *_ptr;                         \
+   tmp &= 0xff000000;                                                  \
+   tmp |= ((d) & 0x00ffffff);                                  \
    *_ptr = tmp;                                        \
+   _ptr = (GLuint*)r600_ptr_stencil(rrb, _x + x_off, _y + y_off);              \
+   tmp = *_ptr;                                \
+   tmp &= 0xffffff00;                                                  \
+   tmp |= ((d) >> 24) & 0xff;                                          \
+   *_ptr = tmp;                                        \
+} while (0)
+#elif defined(RADEON_R200)
+#define WRITE_DEPTH( _x, _y, d )                                       \
+do {                                                                   \
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );            \
+   *_ptr = CPU_TO_LE32(d);                                             \
+} while (0)
+#else
+#define WRITE_DEPTH( _x, _y, d )                                       \
+do {                                                                   \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );    \
+   *_ptr = CPU_TO_LE32(d);                                             \
 } while (0)
 #endif
 
-#ifdef COMPILE_R300
+#if defined(RADEON_R300)
+/* Convert the stored little-endian word to CPU order first, then move the
+ * stencil byte from the low byte to the top byte - the exact inverse of the
+ * matching WRITE_DEPTH, which rotates in CPU order before CPU_TO_LE32. */
+#define READ_DEPTH( d, _x, _y )                                                \
+  do { \
+    GLuint tmp = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)));  \
+    d = ((tmp & 0x000000ff) << 24) | ((tmp & 0xffffff00) >> 8);                \
+  }while(0)
+#elif defined(RADEON_R600)
 #define READ_DEPTH( d, _x, _y )                                                \
   do { \
-    d = (*(GLuint*)(radeon_ptr32(rrb, _x + x_off, _y + y_off)));       \
+    d = (*(GLuint*)(r600_ptr_depth(rrb, _x + x_off, _y + y_off))) & 0x00ffffff; \
+    d |= ((*(GLuint*)(r600_ptr_stencil(rrb, _x + x_off, _y + y_off))) << 24) & 0xff000000; \
+  }while(0)
+#elif defined(RADEON_R200)
+#define READ_DEPTH( d, _x, _y )                                                \
+  do { \
+    d = LE32_TO_CPU(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off))); \
   }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )        do {                                    \
-    d = s8z24_to_z24s8(*(GLuint*)(radeon_ptr32(rrb, _x + x_off,        _y + y_off ))); \
+    d = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off))); \
   } while (0)
 #endif
-/*
-    fprintf(stderr, "dval(%d, %d, %d, %d)=0x%08X\n", _x, xo, _y, yo, d);\
-   d = *(GLuint*)(radeon_ptr(rrb, _x,  _y )) & 0x00ffffff;
-*/
-#define TAG(x) radeon##x##_z24_s8
+
+#define TAG(x) radeon##x##_s8_z24
 #include "depthtmp.h"
 
 /* ================================================================
@@ -414,43 +730,75 @@ do {                                                                      \
 
 /* 24 bit depth, 8 bit stencil depthbuffer functions
  */
-#ifdef COMPILE_R300
+#ifdef RADEON_R300
+#define WRITE_STENCIL( _x, _y, d )                                     \
+do { /* R300: read-modify-write of one GLuint that replaces only its low byte with d */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);              \
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
+   tmp &= 0xffffff00; /* keep bits 8..31 */ \
+   tmp |= (d) & 0xff; /* low 8 bits of d go into bits 0..7 */ \
+   *_ptr = CPU_TO_LE32(tmp);                                            \
+} while (0)
+#elif defined(RADEON_R600)
 #define WRITE_STENCIL( _x, _y, d )                                     \
 do {                                                                   \
-   GLuint *_ptr = (GLuint*)radeon_ptr32(rrb, _x + x_off, _y + y_off);          \
+   GLuint *_ptr = (GLuint*)r600_ptr_stencil(rrb, _x + x_off, _y + y_off); /* R600: low byte of this word is replaced with d. NOTE(review): unlike the other variants, no LE32_TO_CPU/CPU_TO_LE32 is applied here -- confirm intended */ \
    GLuint tmp = *_ptr;                         \
    tmp &= 0xffffff00;                                                  \
    tmp |= (d) & 0xff;                                                  \
    *_ptr = tmp;                                        \
 } while (0)
+#elif defined(RADEON_R200)
+#define WRITE_STENCIL( _x, _y, d )                                     \
+do { /* R200: read-modify-write of the GLuint at r200_depth_4byte's address that replaces only bits 24..31 with d */ \
+   GLuint *_ptr = (GLuint*)r200_depth_4byte(rrb, _x + x_off, _y + y_off);              \
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
+   tmp &= 0x00ffffff; /* keep bits 0..23 */ \
+   tmp |= (((d) & 0xff) << 24); /* low 8 bits of d go into bits 24..31 */ \
+   *_ptr = CPU_TO_LE32(tmp);                                            \
+} while (0)
 #else
 #define WRITE_STENCIL( _x, _y, d )                                     \
 do {                                                                   \
-   GLuint *_ptr = (GLuint*)radeon_ptr32(rrb, _x + x_off, _y + y_off);          \
-   GLuint tmp = *_ptr;                         \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off); /* fallback when none of RADEON_R300/R600/R200 is defined: d replaces bits 24..31, bits 0..23 are kept */ \
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0x00ffffff;                                                  \
    tmp |= (((d) & 0xff) << 24);                                                \
-   *_ptr = tmp;                                        \
+   *_ptr = CPU_TO_LE32(tmp); /* write back through CPU_TO_LE32, mirroring the LE32_TO_CPU on the read */ \
 } while (0)
 #endif
 
-#ifdef COMPILE_R300
+#ifdef RADEON_R300
+#define READ_STENCIL( d, _x, _y )                                      \
+do { /* R300: d receives the low byte of the GLuint at (_x + x_off, _y + y_off) */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );            \
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
+   d = tmp & 0x000000ff; /* bits 0..7 after LE32_TO_CPU */ \
+} while (0)
+#elif defined(RADEON_R600)
 #define READ_STENCIL( d, _x, _y )                                      \
 do {                                                                   \
-   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x + x_off, _y + y_off );                \
+   GLuint *_ptr = (GLuint*)r600_ptr_stencil( rrb, _x + x_off, _y + y_off ); /* R600: d receives the low byte of this word; the word is read without LE32_TO_CPU */ \
    GLuint tmp = *_ptr;                         \
    d = tmp & 0x000000ff;                                               \
 } while (0)
+#elif defined(RADEON_R200)
+#define READ_STENCIL( d, _x, _y )                                      \
+do { /* R200: d receives bits 24..31 of the GLuint at r200_depth_4byte's address */ \
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );            \
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
+   d = (tmp & 0xff000000) >> 24; /* top byte after LE32_TO_CPU */ \
+} while (0)
 #else
 #define READ_STENCIL( d, _x, _y )                                      \
 do {                                                                   \
-   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x + x_off, _y + y_off );                \
-   GLuint tmp = *_ptr;                         \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off ); /* fallback when none of RADEON_R300/R600/R200 is defined: d receives bits 24..31 of this word */ \
+   GLuint tmp = LE32_TO_CPU(*_ptr); /* convert before extracting the top byte below */ \
    d = (tmp & 0xff000000) >> 24;                                       \
 } while (0)
 #endif
 
-#define TAG(x) radeon##x##_z24_s8
+#define TAG(x) radeon##x##_s8_z24 /* pastes to radeon<x>_s8_z24; presumably stenciltmp.h uses TAG to name what it emits (radeonInitStencilPointers_s8_z24 is called in radeonSetSpanFunctions) -- confirm */
 #include "stenciltmp.h"
 
 
@@ -572,25 +920,35 @@ void radeonInitSpanFuncs(GLcontext * ctx)
  */
 static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb)
 {
-       if (rrb->base._ActualFormat == GL_RGB5) {
+       if (rrb->base.Format == MESA_FORMAT_RGB565) { /* dispatch on rrb->base.Format (MESA_FORMAT_*), which replaces the old _ActualFormat GL-enum test; each branch calls one radeonInit*Pointers_* on &rrb->base */
                 radeonInitPointers_RGB565(&rrb->base);
-       } else if (rrb->base._ActualFormat == GL_RGB8) {
+       } else if (rrb->base.Format == MESA_FORMAT_RGB565_REV) { /* each *_REV format gets its own init call */
+               radeonInitPointers_RGB565_REV(&rrb->base);
+       } else if (rrb->base.Format == MESA_FORMAT_XRGB8888) {
                 radeonInitPointers_xRGB8888(&rrb->base);
-       } else if (rrb->base._ActualFormat == GL_RGBA8) {
+        } else if (rrb->base.Format == MESA_FORMAT_XRGB8888_REV) { /* XRGB8888_REV is handled by the BGRx8888 init */
+               radeonInitPointers_BGRx8888(&rrb->base);
+       } else if (rrb->base.Format == MESA_FORMAT_ARGB8888) {
                 radeonInitPointers_ARGB8888(&rrb->base);
-       } else if (rrb->base._ActualFormat == GL_RGBA4) {
+        } else if (rrb->base.Format == MESA_FORMAT_ARGB8888_REV) { /* ARGB8888_REV is handled by the BGRA8888 init */
+               radeonInitPointers_BGRA8888(&rrb->base);
+       } else if (rrb->base.Format == MESA_FORMAT_ARGB4444) {
                 radeonInitPointers_ARGB4444(&rrb->base);
-       } else if (rrb->base._ActualFormat == GL_RGB5_A1) {
+       } else if (rrb->base.Format == MESA_FORMAT_ARGB4444_REV) {
+               radeonInitPointers_ARGB4444_REV(&rrb->base);
+       } else if (rrb->base.Format == MESA_FORMAT_ARGB1555) {
                 radeonInitPointers_ARGB1555(&rrb->base);
-       } else if (rrb->base._ActualFormat == GL_DEPTH_COMPONENT16) {
+       } else if (rrb->base.Format == MESA_FORMAT_ARGB1555_REV) {
+               radeonInitPointers_ARGB1555_REV(&rrb->base);
+       } else if (rrb->base.Format == MESA_FORMAT_Z16) { /* depth formats start here */
                 radeonInitDepthPointers_z16(&rrb->base);
-       } else if (rrb->base._ActualFormat == GL_DEPTH_COMPONENT24) {
+       } else if (rrb->base.Format == MESA_FORMAT_X8_Z24) {
                 radeonInitDepthPointers_z24(&rrb->base);
-       } else if (rrb->base._ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
-               radeonInitDepthPointers_z24_s8(&rrb->base);
-       } else if (rrb->base._ActualFormat == GL_STENCIL_INDEX8_EXT) {
-               radeonInitStencilPointers_z24_s8(&rrb->base);
+       } else if (rrb->base.Format == MESA_FORMAT_S8_Z24) {
+               radeonInitDepthPointers_s8_z24(&rrb->base);
+       } else if (rrb->base.Format == MESA_FORMAT_S8) { /* the S8 format uses the s8_z24 stencil init */
+               radeonInitStencilPointers_s8_z24(&rrb->base);
         } else {
-               fprintf(stderr, "radeonSetSpanFunctions: bad actual format: 0x%04X\n", rrb->base._ActualFormat);
+               fprintf(stderr, "radeonSetSpanFunctions: bad format: 0x%04X\n", rrb->base.Format); /* unmatched format: this branch only logs the value and makes no init call */
         }
 }