From 7a0099b9f3f4cbdb0893a3f11da84326dcf86179 Mon Sep 17 00:00:00 2001 From: Brian Date: Fri, 1 Feb 2008 13:45:58 -0700 Subject: [PATCH] Cell: implement Z16 and Z32 testing with SIMD instructions. --- src/mesa/pipe/cell/spu/spu_tile.h | 3 +- src/mesa/pipe/cell/spu/spu_tri.c | 222 ++++------------------------- src/mesa/pipe/cell/spu/spu_ztest.h | 135 ++++++++++++++++++ 3 files changed, 163 insertions(+), 197 deletions(-) create mode 100644 src/mesa/pipe/cell/spu/spu_ztest.h diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h index 1f123a2b7b4..4b1ef2a4c8d 100644 --- a/src/mesa/pipe/cell/spu/spu_tile.h +++ b/src/mesa/pipe/cell/spu/spu_tile.h @@ -42,7 +42,8 @@ typedef union { ushort t16[TILE_SIZE][TILE_SIZE]; uint t32[TILE_SIZE][TILE_SIZE]; - float4 f4[TILE_SIZE/2][TILE_SIZE/2]; + vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4]; + vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2]; } tile_t; diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c index a32878d9178..a26a4f098da 100644 --- a/src/mesa/pipe/cell/spu/spu_tri.c +++ b/src/mesa/pipe/cell/spu/spu_tri.c @@ -39,18 +39,11 @@ #include "spu_tile.h" #include "spu_tri.h" +#include "spu_ztest.h" -/* - * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions - * to do Z testing/updating. 
- */ -#define SIMD_Z 0 -#if SIMD_Z +/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ typedef vector unsigned int mask_t; -#else -typedef uint mask_t; -#endif /** @@ -282,20 +275,11 @@ pack_colors(uint uicolors[4], const float4 fcolors[4]) } - -static unsigned int -do_depth_test(int x, int y, unsigned int mask) +static INLINE mask_t +do_depth_test(int x, int y, mask_t quadmask) { - static const float4 zscale16 - = {.f={65535.0, 65535.0, 65535.0, 65535.0}}; - static const float4 zscale32 - = {.f={(float)0xffffffff, - (float)0xffffffff, - (float)0xffffffff, - (float)0xffffffff}}; - int ix = x - setup.cliprect_minx; - int iy = y - setup.cliprect_miny; float4 zvals; + mask_t mask; zvals.v = eval_z((float) x, (float) y); @@ -305,129 +289,20 @@ do_depth_test(int x, int y, unsigned int mask) cur_tile_status_z = TILE_STATUS_DIRTY; } -#if 0 - if (cur_tile_status_z == TILE_STATUS_CLEAR) { - /* now, _really_ clear the tile */ - clear_z_tile(&ztile); - } - else if (cur_tile_status_z != TILE_STATUS_DIRTY) { - /* make sure we've got the tile from main mem */ - wait_on_mask(1 << TAG_READ_TILE_Z); - } - cur_tile_status_z = TILE_STATUS_DIRTY; -#endif - if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) { - zvals.v = spu_mul(zvals.v, zscale16.v); - if (mask & MASK_TOP_LEFT) { - uint z = (uint) zvals.f[0]; - if (z < ztile.t16[iy][ix]) - ztile.t16[iy][ix] = z; - else - mask &= ~MASK_TOP_LEFT; - } - - if (mask & MASK_TOP_RIGHT) { - uint z = (uint) zvals.f[1]; - if (z < ztile.t16[iy][ix+1]) - ztile.t16[iy][ix+1] = z; - else - mask &= ~MASK_TOP_RIGHT; - } - - if (mask & MASK_BOTTOM_LEFT) { - uint z = (uint) zvals.f[2]; - if (z < ztile.t16[iy+1][ix]) - ztile.t16[iy+1][ix] = z; - else - mask &= ~MASK_BOTTOM_LEFT; - } - - if (mask & MASK_BOTTOM_RIGHT) { - uint z = (uint) zvals.f[3]; - if (z < ztile.t16[iy+1][ix+1]) - ztile.t16[iy+1][ix+1] = z; - else - mask &= ~MASK_BOTTOM_RIGHT; - } + int ix = (x - setup.cliprect_minx) / 4; + int iy = (y - setup.cliprect_miny) / 2; + 
mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask); } else { - zvals.v = spu_mul(zvals.v, zscale32.v); - ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM); - if (mask & MASK_TOP_LEFT) { - uint z = (uint) zvals.f[0]; - if (z < ztile.t32[iy][ix]) - ztile.t32[iy][ix] = z; - else - mask &= ~MASK_TOP_LEFT; - } - - if (mask & MASK_TOP_RIGHT) { - uint z = (uint) zvals.f[1]; - if (z < ztile.t32[iy][ix+1]) - ztile.t32[iy][ix+1] = z; - else - mask &= ~MASK_TOP_RIGHT; - } - - if (mask & MASK_BOTTOM_LEFT) { - uint z = (uint) zvals.f[2]; - if (z < ztile.t32[iy+1][ix]) - ztile.t32[iy+1][ix] = z; - else - mask &= ~MASK_BOTTOM_LEFT; - } - - if (mask & MASK_BOTTOM_RIGHT) { - uint z = (uint) zvals.f[3]; - if (z < ztile.t32[iy+1][ix+1]) - ztile.t32[iy+1][ix+1] = z; - else - mask &= ~MASK_BOTTOM_RIGHT; - } + int ix = (x - setup.cliprect_minx) / 2; + int iy = (y - setup.cliprect_miny) / 2; + mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask); } - - if (mask) - cur_tile_status_z = TILE_STATUS_DIRTY; - return mask; } - - -static vector unsigned int -do_depth_test_simd(int x, int y, vector unsigned int quadmask) -{ - int ix = (x - setup.cliprect_minx) / 2; - int iy = (y - setup.cliprect_miny) / 2; - float4 zvals; - - vector unsigned int zmask; - - zvals.v = eval_z((float) x, (float) y); - - if (cur_tile_status_z == TILE_STATUS_CLEAR) { - /* now, _really_ clear the tile */ - clear_z_tile(&ztile); - } - else if (cur_tile_status_z != TILE_STATUS_DIRTY) { - /* make sure we've got the tile from main mem */ - wait_on_mask(1 << TAG_READ_TILE_Z); - } - cur_tile_status_z = TILE_STATUS_DIRTY; - - /* XXX fetch Z value sooner to hide latency here */ - zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v); - zmask = spu_and(zmask, quadmask); - - ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask); - //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4); - - return zmask; -} - - /** * Emit a quad (pass to next stage). No clipping is done. 
*/ @@ -461,36 +336,18 @@ emit_quad( int x, int y, mask_t mask ) } if (spu.depth_stencil.depth.enabled) { -#if SIMD_Z - mask = do_depth_test_simd(x, y, mask); -#else mask = do_depth_test(x, y, mask); -#endif } -#if !SIMD_Z - if (mask) -#endif - { - if (cur_tile_status_c == TILE_STATUS_CLEAR) { - /* now, _really_ clear the tile */ - clear_c_tile(&ctile); - } + /* If any bits in mask are set... */ + if (spu_extract(spu_orx(mask), 0)) { -#if 0 if (cur_tile_status_c == TILE_STATUS_CLEAR) { /* now, _really_ clear the tile */ clear_c_tile(&ctile); - cur_tile_status_c = TILE_STATUS_DIRTY; } - else if (cur_tile_status_c != TILE_STATUS_DIRTY) { - /* make sure we've got the tile from main mem */ - wait_on_mask(1 << TAG_READ_TILE_COLOR); - } -#endif cur_tile_status_c = TILE_STATUS_DIRTY; -#if SIMD_Z if (spu_extract(mask, 0)) ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT]; if (spu_extract(mask, 1)) @@ -499,20 +356,11 @@ emit_quad( int x, int y, mask_t mask ) ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT]; if (spu_extract(mask, 3)) ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT]; -#elif 0 + +#if 0 /* SIMD_Z with swizzled color buffer (someday) */ vector float icolors = *((vector float *) &colors); ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask); - -#else - if (mask & MASK_TOP_LEFT) - ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT]; - if (mask & MASK_TOP_RIGHT) - ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT]; - if (mask & MASK_BOTTOM_LEFT) - ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT]; - if (mask & MASK_BOTTOM_RIGHT) - ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT]; #endif } @@ -533,38 +381,20 @@ static INLINE int block( int x ) /** * Compute mask which indicates which pixels in the 2x2 quad are actually inside * the triangle's bounds. - * - * this is pretty nasty... may need to rework flush_spans again to - * fix it, if possible. + * The mask is a uint4 vector and each element will be 0 or 0xffffffff. 
*/
-static mask_t calculate_mask( int x )
+static INLINE mask_t calculate_mask( int x )
 {
-#if SIMD_Z
-   uint m0, m1, m2, m3;
-
-   m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
-   m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
-   m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
-   m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
-
-   return (vector unsigned int) {m0, m1, m2, m3};
-#else
-   unsigned mask = 0x0;
-
-   if (x >= setup.span.left[0] && x < setup.span.right[0])
-      mask |= MASK_TOP_LEFT;
-
-   if (x >= setup.span.left[1] && x < setup.span.right[1])
-      mask |= MASK_BOTTOM_LEFT;
-
-   if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0])
-      mask |= MASK_TOP_RIGHT;
-
-   if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1])
-      mask |= MASK_BOTTOM_RIGHT;
-
+   /* Start from zero so spu_insert() never reads an uninitialized vector.
+    * Use & instead of && to avoid branches.
+    * Use negation to convert true/false to ~0/0 values.
+    */
+   mask_t mask = spu_splats(0u);
+   mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0);
+   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+   mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2);
+   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
    return mask;
-#endif
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
new file mode 100644
index 00000000000..5fefb151765
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Zbuffer/depth test code.
+ */
+
+
+#ifndef SPU_ZTEST_H
+#define SPU_ZTEST_H
+
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+
+
+/**
+ * Perform Z testing (LESS function) for a 16-bit/value Z buffer.
+ *
+ * \param zvals  vector of four fragment zvalues as floats
+ * \param zbuf  ptr to vector of ushort[8] zbuffer values.  Note that this
+ *              contains the Z values for 2 quads, 8 pixels.
+ * \param x  selects the quad within *zbuf: odd -> ushorts 4..7, even -> 0..3
+ * \param inMask  indicates which fragments in the quad are alive
+ * \return  mask of fragments that pass (zval < zbuf); *zbuf is updated for them
+ */
+static INLINE vector unsigned int
+spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
+                  uint x, vector unsigned int inMask)
+{
+#define ZERO 0x80
+   vector unsigned int zvals_ui4, zbuf_ui4, mask;
+
+   /* convert floats to uints in [0, 65535] */
+   zvals_ui4 = spu_convtu(zvals, 32); /* scale by 2^32 -> [0, 2^32-1] */
+   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
+
+   /* XXX this conditional could be removed with a bit of work */
+   if (x & 1) {
+      /* odd x: this quad's Z values are ushorts 4..7 (bytes 8..15) of *zbuf */
+      /* zero-extend them to uints (shuffle index ZERO yields a 0x00 byte) */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                         ZERO, ZERO, 8, 9, ZERO, ZERO, 10, 11,
+                                         ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+      /* mask = (zvals_ui4 < zbuf_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* pack low halves back into ushorts 4..7, keep ushorts 0..3 unchanged */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 16, 17, 18, 19, 20, 21, 22, 23,
+                                 2, 3, 6, 7, 10, 11, 14, 15));
+   }
+   else {
+      /* even x: this quad's Z values are ushorts 0..3 (bytes 0..7) of *zbuf */
+      /* zero-extend them to uints (shuffle index ZERO yields a 0x00 byte) */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+      /* mask = (zvals_ui4 < zbuf_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* pack low halves back into ushorts 0..3, keep ushorts 4..7 unchanged */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 2, 3, 6, 7, 10, 11, 14, 15,
+                                 24, 25, 26, 27, 28, 29, 30, 31));
+   }
+   return mask;
+#undef ZERO
+}
+
+
+/**
+ * As above, but *zbuf_ptr holds one quad of 32-bit uint Zbuffer values.
+ */
+static INLINE vector unsigned int
+spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
+                  vector unsigned int inMask)
+{
+   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
+
+   /* convert floats to uints in [0, 0xffffffff] */
+   zvals_ui4 = spu_convtu(zvals, 32);
+   /* mask = (zvals_ui4 < zbuf) ? ~0 : 0 */
+   mask = spu_cmpgt(zbuf, zvals_ui4);
+   /* mask &= inMask */
+   mask = spu_and(mask, inMask);
+   /* zbuf = mask ? zval : zbuf */
+   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
+
+   return mask;
+}
+
+
+#endif /* SPU_ZTEST_H */
-- 
2.30.2