r300: Add proper support for sin/cos instruction in fragment program
author Rune Peterson <rune@megahurts.dk>
Sun, 11 Feb 2007 23:24:36 +0000 (00:24 +0100)
committer Jerome Glisse <glisse@freedesktop.org>
Sun, 11 Feb 2007 23:24:36 +0000 (00:24 +0100)
Getting proper SIN and COS wasn't as easy as it appeared.
I had to make some changes to the fragment program code.
general FP changes:
- support HHH swizzle for vector instructions.
- don't copy a source to a temp when it is not XYZW swizzled, but
 combine the two and have the swizzle resolve any issues.
 (saves temps/instructions with more elaborate shader code)
- fix overflow in cnstv[].

src/mesa/drivers/dri/r300/r300_context.h
src/mesa/drivers/dri/r300/r300_fragprog.c
src/mesa/drivers/dri/r300/r300_fragprog.h
src/mesa/drivers/dri/r300/r300_render.c
src/mesa/drivers/dri/r300/r300_state.c
src/mesa/drivers/dri/radeon/radeon_screen.c

index 02f8e9107d829aedff5e7d64b73e14d425c08823..b1402351591ab3ddc604350a220bd95085b72f9e 100644 (file)
@@ -729,6 +729,11 @@ struct r300_fragment_program {
        GLboolean params_uptodate;
 
        int max_temp_idx;
+
+       /* the index of the sin constant is stored here */
+       GLint const_sin;
+       
+       GLuint optimization;
 };
 
 #define R300_MAX_AOS_ARRAYS            16
index 6e85f0b5ddc83762565b751700dce17295106cdf..b00cf9ed33e5922706e45692265604c947c977c4 100644 (file)
@@ -33,7 +33,7 @@
 
 /*TODO'S
  *
- * - COS/SIN/SCS instructions
+ * - SCS instructions
  * - Depth write, WPOS/FOGC inputs
  * - FogOption
  * - Verify results of opcodes for accuracy, I've only checked them
@@ -187,6 +187,10 @@ static const struct {
 #define SLOT_VECTOR    (1<<0)
 #define SLOT_SCALAR    (1<<3)
 #define SLOT_BOTH      (SLOT_VECTOR | SLOT_SCALAR)
+
+/* mapping from SWIZZLE_* to r300 native values for scalar insns */
+#define SWIZZLE_HALF 6
+
 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
                                          SWIZZLE_##y, \
                                          SWIZZLE_##z, \
@@ -208,7 +212,7 @@ static const struct r300_pfs_swizzle {
        { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH },
        { MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
        { MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
-       { PFS_INVAL, R300_FPI0_ARGC_HALF, 0, 0},
+       { MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
        { PFS_INVAL, 0, 0, 0},
 };
 
@@ -232,8 +236,6 @@ static const struct {
        { PFS_INVAL, PFS_INVAL, PFS_INVAL}
 };
 
-/* mapping from SWIZZLE_* to r300 native values for scalar insns */
-#define SWIZZLE_HALF 6
 static const struct {
        int base;       /* hw value of swizzle */
        int stride;     /* difference between SRC0/1/2 */
@@ -590,6 +592,7 @@ static GLuint do_swizzle(struct r300_fragment_program *rp,
        /* If swizzling from something without an XYZW native swizzle,
         * emit result to a temp, and do new swizzle from the temp.
         */
+#if 0
        if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
            REG_GET_SSWZ(src) != SWIZZLE_W) {
                GLuint temp = get_temp_reg(rp);
@@ -603,10 +606,30 @@ static GLuint do_swizzle(struct r300_fragment_program *rp,
                           0);
                src = temp;
        }
+#endif
 
-       /* set scalar swizzling */
-       REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
+       if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
+           REG_GET_SSWZ(src) != SWIZZLE_W) {
+           GLuint vsrcswz = (v_swiz[REG_GET_VSWZ(src)].hash & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK)) | REG_GET_SSWZ(src) << 9;
+           GLint i;
 
+           GLuint newswz = 0;
+           GLuint offset;
+           for(i=0; i < 4; ++i){
+               offset = GET_SWZ(arbswz, i);
+               
+               newswz |= (offset <= 3)?GET_SWZ(vsrcswz, offset) << i*3:offset << i*3;
+           }
+
+           arbswz = newswz & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK);
+           REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
+       }
+       else
+       {
+           /* set scalar swizzling */
+           REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
+
+       }
        do {
                vswz = REG_GET_VSWZ(src);
                do {
@@ -1234,62 +1257,87 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
                        break;
                case OPCODE_COS:
                        /*
-                        * cos using taylor serie:
-                        * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6!
+                        * cos using a parabola (see SIN):
+                        * cos(x):
+                        *   x += PI/2
+                        *   x = (x < PI)?x : x-2*PI
+                        *   result = sin(x)
                         */
                        temp = get_temp_reg(rp);
-                       cnstv[0] = 0.5;
-                       cnstv[1] = 0.041666667;
-                       cnstv[2] = 0.001388889;
-                       cnstv[4] = 0.0;
-                       cnst = emit_const4fv(rp, cnstv);
+                       if(rp->const_sin == -1){
+                           cnstv[0] = 1.273239545;
+                           cnstv[1] =-0.405284735;
+                           cnstv[2] = 3.141592654;
+                           cnstv[3] = 0.225;
+                           rp->const_sin = emit_const4fv(rp, cnstv);
+                       }
+                       cnst = rp->const_sin;                   
                        src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
 
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_XYZ,
-                                  src[0],
-                                  src[0],
-                                  pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_Y | WRITEMASK_Z,
-                                  temp, temp,
-                                  pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_Z,
-                                  temp,
-                                  swizzle(temp, X, X, X, W),
-                                  pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_XYZ,
-                                  temp, cnst,
-                                  pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_X,
-                                  pfs_one,
-                                  pfs_one,
-                                  negate(temp),
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_X,
-                                  temp,
-                                  pfs_one,
-                                  swizzle(temp, Y, Y, Y, W),
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_X,
-                                  temp,
-                                  pfs_one,
-                                  negate(swizzle(temp, Z, Z, Z, W)),
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, dest, mask,
+                       emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
+                                  pfs_half,
+                                  undef,
+                                  undef,
+                                  0);
+
+                       emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+                                  swizzle(cnst, Z, Z, Z, Z), //PI
+                                  pfs_half,
+                                  swizzle(keep(src[0]), X, X, X, X),
+                                  0);
+
+                       emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
+                                  negate(swizzle(temp, W, W, W, W)), //-2
+                                  swizzle(cnst, Z, Z, Z, Z), //PI
                                   swizzle(temp, X, X, X, X),
-                                  pfs_one,
+                                  0);
+
+                       emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
+                                  swizzle(cnst, Z, Z, Z, Z), //PI
+                                  negate(pfs_half),
+                                  swizzle(src[0], X, X, X, X),
+                                  0);
+                       
+                       emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
+                                  swizzle(temp, W, W, W, W),
+                                  swizzle(temp, X, X, X, X),
+                                  swizzle(temp, Y, Y, Y, Y), 
+                                  0);
+
+                       /* SIN */
+
+                       emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X | WRITEMASK_Y,
+                                  swizzle(temp, Z, Z, Z, Z),
+                                  cnst,
                                   pfs_zero,
-                                  flags);
+                                  0);
+
+                       if(rp->optimization == DRI_CONF_FP_OPTIMIZATION_SPEED){
+                           emit_arith(rp, PFS_OP_MAD, dest, mask,
+                                      swizzle(temp, Y, Y, Y, Y),
+                                      absolute(swizzle(temp, Z, Z, Z, Z)),
+                                      swizzle(temp, X, X, X, X),
+                                      flags);
+                       }else{
+                           emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+                                      swizzle(temp, Y, Y, Y, Y),
+                                      absolute(swizzle(temp, Z, Z, Z, Z)),
+                                      swizzle(temp, X, X, X, X),
+                                      0);
+                       
+                           emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
+                                      swizzle(temp, X, X, X, X),
+                                      absolute(swizzle(temp, X, X, X, X)),
+                                      negate(swizzle(temp, X, X, X, X)),
+                                      0);
+
+
+                           emit_arith(rp, PFS_OP_MAD, dest, mask,
+                                      swizzle(temp, Y, Y, Y, Y),
+                                      swizzle(cnst, W, W, W, W),
+                                      swizzle(temp, X, X, X, X),
+                                      flags);
+                       }
                        free_temp(rp, temp);
                        break;
                case OPCODE_DP3:
@@ -1398,7 +1446,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
                         * change the compare to (t.x + 0.5) > 0.5 we may
                         * save one instruction by doing CMP -t.x 
                         */
-                       cnstv[0] = cnstv[1] = cnstv[2] = cnstv[4] = 0.50001;
+                       cnstv[0] = cnstv[1] = cnstv[2] = cnstv[3] = 0.50001;
                        src[0] = t_src(rp, fpi->SrcReg[0]);
                        temp = get_temp_reg(rp);
                        cnst = emit_const4fv(rp, cnstv);
@@ -1548,68 +1596,55 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
                        break;
                case OPCODE_SIN:
                        /*
-                        * sin using taylor serie:
-                        * sin(x) = x - x^3/3! + x^5/5! - x^7/7!
+                        *  using a parabola:
+                        * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
+                        * extra precision is obtained by weighting against
+                        * itself squared.
                         */
+
                        temp = get_temp_reg(rp);
-                       cnstv[0] = 0.333333333;
-                       cnstv[1] = 0.008333333;
-                       cnstv[2] = 0.000198413;
-                       cnstv[4] = 0.0;
-                       cnst = emit_const4fv(rp, cnstv);
+                       if(rp->const_sin == -1){
+                           cnstv[0] = 1.273239545;
+                           cnstv[1] =-0.405284735;
+                           cnstv[2] = 3.141592654;
+                           cnstv[3] = 0.225;
+                           rp->const_sin = emit_const4fv(rp, cnstv);
+                       }
+                       cnst = rp->const_sin;
                        src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
 
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_XYZ,
-                                  src[0],
-                                  src[0],
-                                  pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_Y | WRITEMASK_Z,
-                                  temp, temp,
-                                  pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_Z,
-                                  temp,
-                                  swizzle(temp, X, X, X, W),
+                       emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X | WRITEMASK_Y,
+                                  swizzle(keep(src[0]), X, X, X, X),
+                                  cnst,
                                   pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_XYZ,
-                                  src[0],
-                                  temp,
-                                  pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_XYZ,
-                                  temp, cnst,
-                                  pfs_zero,
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_X,
-                                  src[0],
-                                  pfs_one,
-                                  negate(temp),
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_X,
-                                  temp,
-                                  pfs_one,
-                                  swizzle(temp, Y, Y, Y, W),
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, temp,
-                                  WRITEMASK_X,
-                                  temp,
-                                  pfs_one,
-                                  negate(swizzle(temp, Z, Z, Z, W)),
-                                  flags);
-                       emit_arith(rp, PFS_OP_MAD, dest, mask,
-                                  swizzle(temp, X, X, X, X),
-                                  pfs_one,
-                                  pfs_zero,
-                                  flags);
+                                  0);
+
+                       if(rp->optimization == DRI_CONF_FP_OPTIMIZATION_SPEED){
+                           emit_arith(rp, PFS_OP_MAD, dest, mask,
+                                      swizzle(temp, Y, Y, Y, Y),
+                                      absolute(swizzle(src[0], X, X, X, X)),
+                                      swizzle(temp, X, X, X, X),
+                                      flags);
+                       }else{
+                           emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+                                      swizzle(temp, Y, Y, Y, Y),
+                                      absolute(swizzle(src[0], X, X, X, X)),
+                                      swizzle(temp, X, X, X, X),
+                                      0);
+                       
+                           emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
+                                      swizzle(temp, X, X, X, X),
+                                      absolute(swizzle(temp, X, X, X, X)),
+                                      negate(swizzle(temp, X, X, X, X)),
+                                      0);
+
+
+                           emit_arith(rp, PFS_OP_MAD, dest, mask,
+                                      swizzle(temp, Y, Y, Y, Y),
+                                      swizzle(cnst, W, W, W, W),
+                                      swizzle(temp, X, X, X, X),
+                                      flags);
+                       }
                        free_temp(rp, temp);
                        break;
                case OPCODE_SLT:
@@ -1681,7 +1716,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 /* - Init structures
  * - Determine what hwregs each input corresponds to
  */
-static void init_program(struct r300_fragment_program *rp)
+static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 {
        struct r300_pfs_compile_state *cs = NULL;
        struct gl_fragment_program *mp = &rp->mesa_program;     
@@ -1691,6 +1726,7 @@ static void init_program(struct r300_fragment_program *rp)
        int i,j;
 
        /* New compile, reset tracking data */
+       rp->optimization = driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
        rp->translated = GL_FALSE;
        rp->error      = GL_FALSE;
        rp->cs = cs        = &(R300_CONTEXT(rp->ctx)->state.pfs_compile);
@@ -1703,6 +1739,7 @@ static void init_program(struct r300_fragment_program *rp)
        rp->max_temp_idx = 0;
        rp->node[0].alu_end = -1;
        rp->node[0].tex_end = -1;
+       rp->const_sin = -1;
        
        _mesa_memset(cs, 0, sizeof(*rp->cs));
        for (i=0;i<PFS_MAX_ALU_INST;i++) {
@@ -1816,13 +1853,13 @@ static void update_params(struct r300_fragment_program *rp)
        rp->params_uptodate = GL_TRUE;
 }
 
-void r300_translate_fragment_shader(struct r300_fragment_program *rp)
+void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp)
 {
        struct r300_pfs_compile_state *cs = NULL;
 
        if (!rp->translated) {
                
-               init_program(rp);
+               init_program(r300, rp);
                cs = rp->cs;
 
                if (parse_program(rp) == GL_FALSE) {
index b0cebe60bb0d474a6a2cf151d546f06d7bb1a062..73986abc3cc611f099d7bb7b10816c9771c4fec8 100644 (file)
@@ -112,8 +112,11 @@ typedef struct r300_fragment_program_swizzle {
                ((0 | SRC_CONST) << R300_FPI3_SRC1A_SHIFT) | \
                ((0 | SRC_CONST) << R300_FPI3_SRC2A_SHIFT))
 
+#define DRI_CONF_FP_OPTIMIZATION_SPEED   0
+#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
+
 struct r300_fragment_program;
 
-extern void r300_translate_fragment_shader(struct r300_fragment_program *rp);
+extern void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp);
 
 #endif
index e29df8769624e91cb393662bd6dbf251a7411895..211c451f661a9430f646b54233e74ffed87137df 100644 (file)
@@ -392,7 +392,7 @@ int r300Fallback(GLcontext *ctx)
 
        if (rp) {
                if (!rp->translated)
-                       r300_translate_fragment_shader(rp);
+                       r300_translate_fragment_shader(r300, rp);
 
                FALLBACK_IF(!rp->translated);
        }
index a12f3bb531342be88fb955e2597620736f6594a7..906dfceb482e4867a5ebf307fbf8f96781ee14e8 100644 (file)
@@ -1820,7 +1820,7 @@ void r300SetupPixelShader(r300ContextPtr rmesa)
        if (!rp)        /* should only happenen once, just after context is created */
                return;
        
-       r300_translate_fragment_shader(rp);
+       r300_translate_fragment_shader(rmesa, rp);
        if (!rp->translated) {
                fprintf(stderr, "%s: No valid fragment shader, exiting\n", __func__);
                return;
index cee1f7e2f9365bf19f52611c936f8f0ca0fc1b6e..fc5aa11462109fd6f4c920e24815dd8f4d9dacd3 100644 (file)
@@ -55,6 +55,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_span.h"
 #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
 #include "r300_context.h"
+#include "r300_fragprog.h"
 #include "radeon_span.h"
 #endif
 
@@ -168,6 +169,13 @@ DRI_CONF_OPT_BEGIN(disable_stencil_two_side,bool,def) \
         DRI_CONF_DESC(en,"Disable GL_EXT_stencil_two_side") \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_FP_OPTIMIZATION(def) \
+DRI_CONF_OPT_BEGIN_V(fp_optimization,enum,def,"0:1") \
+       DRI_CONF_DESC_BEGIN(en,"Fragment Program optimization") \
+                DRI_CONF_ENUM(0,"Optimize for Speed") \
+                DRI_CONF_ENUM(1,"Optimize for Quality") \
+        DRI_CONF_DESC_END \
+DRI_CONF_OPT_END
 
 const char __driConfigOptions[] =
 DRI_CONF_BEGIN
@@ -190,12 +198,13 @@ DRI_CONF_BEGIN
                DRI_CONF_COLOR_REDUCTION(DRI_CONF_COLOR_REDUCTION_DITHER)
                DRI_CONF_ROUND_MODE(DRI_CONF_ROUND_TRUNC)
                DRI_CONF_DITHER_MODE(DRI_CONF_DITHER_XERRORDIFF)
+               DRI_CONF_FP_OPTIMIZATION(DRI_CONF_FP_OPTIMIZATION_SPEED)
        DRI_CONF_SECTION_END
        DRI_CONF_SECTION_DEBUG
                DRI_CONF_NO_RAST(false)
        DRI_CONF_SECTION_END
 DRI_CONF_END;
-static const GLuint __driNConfigOptions = 17;
+static const GLuint __driNConfigOptions = 18;
 
 #ifndef RADEON_DEBUG
 int RADEON_DEBUG = 0;