r300: Allow adding parameters during fragprog transform, share LIT code
author Nicolai Haehnle <nhaehnle@gmail.com>
Sat, 5 Jul 2008 20:21:24 +0000 (22:21 +0200)
committer Nicolai Haehnle <nhaehnle@gmail.com>
Sun, 6 Jul 2008 07:59:43 +0000 (09:59 +0200)
src/mesa/drivers/dri/r300/r300_context.h
src/mesa/drivers/dri/r300/r300_fragprog.c
src/mesa/drivers/dri/r300/r300_fragprog_emit.c
src/mesa/drivers/dri/r300/r300_state.c
src/mesa/drivers/dri/r300/r500_fragprog.c
src/mesa/drivers/dri/r300/r500_fragprog_emit.c
src/mesa/drivers/dri/r300/radeon_program_alu.c

index a24ab0cad76aa33735d5089fed3bffc52640204f..a69beba9a7b65b987acd5640de81536223d3ec70 100644 (file)
@@ -716,14 +716,11 @@ struct r300_fragment_program_code {
        int tex_offset;
        int tex_end;
 
-       /* Hardware constants.
-        * Contains a pointer to the value. The destination of the pointer
-        * is supposed to be updated when GL state changes.
-        * Typically, this is either a pointer into
-        * gl_program_parameter_list::ParameterValues, or a pointer to a
-        * global constant (e.g. for sin/cos-approximation)
+       /**
+        * Remember which program register a given hardware constant
+        * belongs to.
         */
-       const GLfloat *constant[PFS_NUM_CONST_REGS];
+       struct prog_src_register constant[PFS_NUM_CONST_REGS];
        int const_nr;
 
        int max_temp_idx;
@@ -787,14 +784,11 @@ struct r500_fragment_program_code {
        int inst_offset;
        int inst_end;
 
-       /* Hardware constants.
-       * Contains a pointer to the value. The destination of the pointer
-       * is supposed to be updated when GL state changes.
-       * Typically, this is either a pointer into
-       * gl_program_parameter_list::ParameterValues, or a pointer to a
-       * global constant (e.g. for sin/cos-approximation)
-       */
-       const GLfloat *constant[PFS_NUM_CONST_REGS];
+       /**
+        * Remember which program register a given hardware constant
+        * belongs to.
+        */
+       struct prog_src_register constant[PFS_NUM_CONST_REGS];
        int const_nr;
 
        int max_temp_idx;
index 6a8ef0ef5fc3be5549d75b0b4c9291f47bd3ebc1..57987f5d0fcd0f7538320d6fd9559c91a7130d9b 100644 (file)
@@ -117,9 +117,7 @@ static GLboolean transform_TEX(
                int factor_index;
 
                tokens[2] = inst.TexSrcUnit;
-               factor_index =
-                       _mesa_add_state_reference(
-                               compiler->fp->mesa_program.Base.Parameters, tokens);
+               factor_index = _mesa_add_state_reference(t->Program->Parameters, tokens);
 
                tgt = radeonAppendInstructions(t->Program, 1);
 
@@ -303,7 +301,7 @@ static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
        i++;
 
        /* viewport transformation */
-       window_index = _mesa_add_state_reference(compiler->fp->mesa_program.Base.Parameters, tokens);
+       window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
 
        fpi[i].Opcode = OPCODE_MAD;
 
@@ -401,6 +399,11 @@ void r300TranslateFragmentShader(r300ContextPtr r300,
                compiler.code = &fp->code;
                compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
 
+               if (RADEON_DEBUG & DEBUG_PIXEL) {
+                       _mesa_printf("Fragment Program: Initial program:\n");
+                       _mesa_print_program(compiler.program);
+               }
+
                insert_WPOS_trailer(&compiler);
 
                struct radeon_program_transformation transformations[] = {
@@ -413,13 +416,18 @@ void r300TranslateFragmentShader(r300ContextPtr r300,
                        2, transformations);
 
                if (RADEON_DEBUG & DEBUG_PIXEL) {
-                       _mesa_printf("Program after transformations:\n");
+                       _mesa_printf("Fragment Program: After transformations:\n");
                        _mesa_print_program(compiler.program);
                }
 
                if (!r300FragmentProgramEmit(&compiler))
                        fp->error = GL_TRUE;
 
+               /* Subtle: Rescue any parameters that have been added during transformations */
+               _mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
+               fp->mesa_program.Base.Parameters = compiler.program->Parameters;
+               compiler.program->Parameters = 0;
+
                _mesa_reference_program(r300->radeon.glCtx, &compiler.program, NULL);
 
                if (!fp->error)
index 889631f70541e7f5aae8cd7a4eca8e432f03558f..d95008edc0612408afed9f0e2fd4d6a1d1bbff35 100644 (file)
@@ -549,22 +549,17 @@ static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
 
 /**
  * Emit a hardware constant/parameter.
- *
- * \p cp Stable pointer to an array of 4 floats.
- *  The pointer must be stable in the sense that it remains to be valid
- *  and hold the contents of the constant/parameter throughout the lifetime
- *  of the fragment program (actually, up until the next time the fragment
- *  program is translated).
  */
 static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
-                           const GLfloat * cp)
+                           struct prog_src_register srcreg)
 {
        COMPILE_STATE;
        GLuint reg = undef;
        int index;
 
        for (index = 0; index < code->const_nr; ++index) {
-               if (code->constant[index] == cp)
+               if (code->constant[index].File == srcreg.File &&
+                   code->constant[index].Index == srcreg.Index)
                        break;
        }
 
@@ -575,7 +570,7 @@ static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
                }
 
                code->const_nr++;
-               code->constant[index] = cp;
+               code->constant[index] = srcreg;
        }
 
        REG_SET_TYPE(reg, REG_TYPE_CONST);
@@ -806,20 +801,11 @@ static GLuint t_src(struct r300_pfs_compile_state *cs,
                REG_SET_TYPE(r, REG_TYPE_INPUT);
                break;
        case PROGRAM_LOCAL_PARAM:
-               r = emit_const4fv(cs,
-                                 fp->mesa_program.Base.LocalParams[fpsrc.
-                                                                   Index]);
-               break;
        case PROGRAM_ENV_PARAM:
-               r = emit_const4fv(cs,
-                       cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
-               break;
        case PROGRAM_STATE_VAR:
        case PROGRAM_NAMED_PARAM:
        case PROGRAM_CONSTANT:
-               r = emit_const4fv(cs,
-                                 fp->mesa_program.Base.Parameters->
-                                 ParameterValues[fpsrc.Index]);
+               r = emit_const4fv(cs, fpsrc);
                break;
        case PROGRAM_BUILTIN:
                switch(fpsrc.Swizzle) {
@@ -1452,100 +1438,17 @@ static GLfloat SinCosConsts[2][4] = {
         }
 };
 
-/**
- * Emit a LIT instruction.
- * \p flags may be PFS_FLAG_SAT
- *
- * Definition of LIT (from ARB_fragment_program):
- * tmp = VectorLoad(op0);
- * if (tmp.x < 0) tmp.x = 0;
- * if (tmp.y < 0) tmp.y = 0;
- * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
- * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
- * result.x = 1.0;
- * result.y = tmp.x;
- * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
- * result.w = 1.0;
- *
- * The longest path of computation is the one leading to result.z,
- * consisting of 5 operations. This implementation of LIT takes
- * 5 slots. So unless there's some special undocumented opcode,
- * this implementation is potentially optimal. Unfortunately,
- * emit_arith is a bit too conservative because it doesn't understand
- * partial writes to the vector component.
- */
-static const GLfloat LitConst[4] =
-    { 127.999999, 127.999999, 127.999999, -127.999999 };
-
-static void emit_lit(struct r300_pfs_compile_state *cs,
-                    GLuint dest, int mask, GLuint src, int flags)
+static GLuint emit_sincosconsts(struct r300_pfs_compile_state *cs, int i)
 {
-       COMPILE_STATE;
-       GLuint cnst;
-       int needTemporary;
-       GLuint temp;
-
-       cnst = emit_const4fv(cs, LitConst);
-
-       needTemporary = 0;
-       if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
-               needTemporary = 1;
-       } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-               // LIT is typically followed by DP3/DP4, so there's no point
-               // in creating special code for this case
-               needTemporary = 1;
-       }
+       struct prog_src_register srcreg;
+       GLuint constant_swizzle;
 
-       if (needTemporary) {
-               temp = keep(get_temp_reg(cs));
-       } else {
-               temp = keep(dest);
-       }
+       srcreg.File = PROGRAM_CONSTANT;
+       srcreg.Index = _mesa_add_unnamed_constant(cs->compiler->program->Parameters,
+               SinCosConsts[i], 4, &constant_swizzle);
+       srcreg.Swizzle = constant_swizzle;
 
-       // Note: The order of emit_arith inside the slots is relevant,
-       // because emit_arith only looks at scalar vs. vector when resolving
-       // dependencies, and it does not consider individual vector components,
-       // so swizzling between the two parts can create fake dependencies.
-
-       // First slot
-       emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
-                  keep(src), pfs_zero, undef, 0);
-       emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
-
-       // Second slot
-       emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
-                  swizzle(temp, W, W, W, W), cnst, undef, 0);
-       emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
-                  swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
-
-       // Third slot
-       // If desired, we saturate the y result here.
-       // This does not affect the use as a condition variable in the CMP later
-       emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
-                  temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
-       emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
-                  swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
-
-       // Fourth slot
-       emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
-                  pfs_one, pfs_one, pfs_zero, 0);
-       emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
-
-       // Fifth slot
-       emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
-                  pfs_zero, swizzle(temp, W, W, W, W),
-                  negate(swizzle(temp, Y, Y, Y, Y)), flags);
-       emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
-                  pfs_zero, 0);
-
-       if (needTemporary) {
-               emit_arith(cs, PFS_OP_MAD, dest, mask,
-                          temp, pfs_one, pfs_zero, flags);
-               free_temp(cs, temp);
-       } else {
-               // Decrease refcount of the destination
-               t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
-       }
+       return emit_const4fv(cs, srcreg);
 }
 
 static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi)
@@ -1577,8 +1480,8 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst
                src[1] = t_src(cs, fpi->SrcReg[1]);
                src[2] = t_src(cs, fpi->SrcReg[2]);
                /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
-                       *    r300 - if src2.c < 0.0 ? src1.c : src0.c
-                       */
+                *    r300 - if src2.c < 0.0 ? src1.c : src0.c
+                */
                emit_arith(cs, PFS_OP_CMP, dest, mask,
                                src[2], src[1], src[0], flags);
                break;
@@ -1592,8 +1495,8 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst
                        *   result = sin(x)
                        */
                temp[0] = get_temp_reg(cs);
-               const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
-               const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+               const_sin[0] = emit_sincosconsts(cs, 0);
+               const_sin[1] = emit_sincosconsts(cs, 1);
                src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
 
                /* add 0.5*PI and do range reduction */
@@ -1687,10 +1590,6 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst
                emit_arith(cs, PFS_OP_LG2, dest, mask,
                                src[0], undef, undef, flags);
                break;
-       case OPCODE_LIT:
-               src[0] = t_src(cs, fpi->SrcReg[0]);
-               emit_lit(cs, dest, mask, src[0], flags);
-               break;
        case OPCODE_LRP:
                src[0] = t_src(cs, fpi->SrcReg[0]);
                src[1] = t_src(cs, fpi->SrcReg[1]);
@@ -1758,8 +1657,8 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst
                        */
                temp[0] = get_temp_reg(cs);
                temp[1] = get_temp_reg(cs);
-               const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
-               const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+               const_sin[0] = emit_sincosconsts(cs, 0);
+               const_sin[1] = emit_sincosconsts(cs, 1);
                src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
 
                /* x = -abs(x)+0.5*PI */
@@ -1825,8 +1724,8 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst
                        */
 
                temp[0] = get_temp_reg(cs);
-               const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
-               const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+               const_sin[0] = emit_sincosconsts(cs, 0);
+               const_sin[1] = emit_sincosconsts(cs, 1);
                src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
 
                /* do range reduction */
index 0f7c179de84c203f742b61425e11eb36ddda058d..d7a6962acc857dc3c2784946b9a5437e79c48505 100644 (file)
@@ -2453,6 +2453,27 @@ void r300UpdateShaders(r300ContextPtr rmesa)
        r300UpdateStateParameters(ctx, _NEW_PROGRAM);
 }
 
+static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
+       struct gl_program *program, struct prog_src_register srcreg)
+{ /* Look up the 4-float value behind a remembered program register. */
+       static const GLfloat dummy[4] = { 0, 0, 0, 0 }; /* all-zero fallback returned for unhandled register files */
+
+       switch(srcreg.File) { /* choose the storage that backs this register file */
+       case PROGRAM_LOCAL_PARAM:
+               return program->LocalParams[srcreg.Index];
+       case PROGRAM_ENV_PARAM:
+               return ctx->FragmentProgram.Parameters[srcreg.Index];
+       case PROGRAM_STATE_VAR:
+       case PROGRAM_NAMED_PARAM:
+       case PROGRAM_CONSTANT:
+               return program->Parameters->ParameterValues[srcreg.Index]; /* these three files all index the program's parameter list */
+       default:
+               _mesa_problem(ctx, "get_fragmentprogram_constant: Unknown\n");
+               return dummy; /* callers read elements [0..3], so still hand back 4 floats */
+       }
+}
+
+
 static void r300SetupPixelShader(r300ContextPtr rmesa)
 {
        GLcontext *ctx = rmesa->radeon.glCtx;
@@ -2523,10 +2544,12 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
        R300_STATECHANGE(rmesa, fpp);
        rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
        for (i = 0; i < code->const_nr; i++) {
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(code->constant[i][0]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(code->constant[i][1]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(code->constant[i][2]);
-               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(code->constant[i][3]);
+               const GLfloat *constant = get_fragmentprogram_constant(ctx,
+                       &fp->mesa_program.Base, code->constant[i]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(constant[3]);
        }
 }
 
@@ -2595,10 +2618,12 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
 
        R300_STATECHANGE(rmesa, r500fp_const);
        for (i = 0; i < code->const_nr; i++) {
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(code->constant[i][0]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(code->constant[i][1]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(code->constant[i][2]);
-               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(code->constant[i][3]);
+               const GLfloat *constant = get_fragmentprogram_constant(ctx,
+                       &fp->mesa_program.Base, code->constant[i]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]);
        }
        bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
 
index 7ee84947225c7ec02af919545b77d3510ef3135d..1cdb065354be61f4ddde8e7c3b05a2f09272faeb 100644 (file)
@@ -212,7 +212,7 @@ static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler)
        i++;
 
        /* viewport transformation */
-       window_index = _mesa_add_state_reference(compiler->fp->mesa_program.Base.Parameters, tokens);
+       window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
 
        fpi[i].Opcode = OPCODE_MAD;
 
@@ -332,6 +332,11 @@ void r500TranslateFragmentShader(r300ContextPtr r300,
 
                fp->translated = r500FragmentProgramEmit(&compiler);
 
+               /* Subtle: Rescue any parameters that have been added during transformations */
+               _mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
+               fp->mesa_program.Base.Parameters = compiler.program->Parameters;
+               compiler.program->Parameters = 0;
+
                _mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0);
 
                r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
@@ -461,9 +466,8 @@ static void dump_program(struct r500_fragment_program_code *code)
   if (code->const_nr) {
     fprintf(stderr, "--------\nConstants:\n");
     for (n = 0; n < code->const_nr; n++) {
-      fprintf(stderr, "Constant %d: %f %f\n\t %f %f\n", n,
-        code->constant[n][0], code->constant[n][1], code->constant[n][2],
-        code->constant[n][3]);
+      fprintf(stderr, "Constant %d: %i[%i]\n", n,
+        code->constant[n].File, code->constant[n].Index);
     }
     fprintf(stderr, "--------\n");
   }
index 0e95c81e482606a2886f911a2c39ebed44ed5dad..c79bff96bd4df2870bf886b976676724f7fbcfde 100644 (file)
@@ -266,7 +266,7 @@ static int get_temp(struct r500_pfs_compile_state *cs, int slot) {
 
 /* Borrowed verbatim from r300_fragprog since it hasn't changed. */
 static GLuint emit_const4fv(struct r500_pfs_compile_state *cs,
-                           const GLfloat * cp)
+                           struct prog_src_register srcreg)
 {
        PROG_CODE;
 
@@ -274,7 +274,8 @@ static GLuint emit_const4fv(struct r500_pfs_compile_state *cs,
        int index;
 
        for (index = 0; index < code->const_nr; ++index) {
-               if (code->constant[index] == cp)
+               if (code->constant[index].File == srcreg.File &&
+                   code->constant[index].Index == srcreg.Index)
                        break;
        }
 
@@ -285,7 +286,7 @@ static GLuint emit_const4fv(struct r500_pfs_compile_state *cs,
                }
 
                code->const_nr++;
-               code->constant[index] = cp;
+               code->constant[index] = srcreg;
        }
 
        reg = index | REG_CONSTANT;
@@ -303,18 +304,11 @@ static GLuint make_src(struct r500_pfs_compile_state *cs, struct prog_src_regist
                reg = cs->inputs[src.Index].reg;
                break;
        case PROGRAM_LOCAL_PARAM:
-               reg = emit_const4fv(cs,
-                       cs->compiler->fp->mesa_program.Base.LocalParams[src.Index]);
-               break;
        case PROGRAM_ENV_PARAM:
-               reg = emit_const4fv(cs,
-                       cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[src.Index]);
-               break;
        case PROGRAM_STATE_VAR:
        case PROGRAM_NAMED_PARAM:
        case PROGRAM_CONSTANT:
-               reg = emit_const4fv(cs,
-                       cs->compiler->fp->mesa_program.Base.Parameters->ParameterValues[src.Index]);
+               reg = emit_const4fv(cs, src);
                break;
        case PROGRAM_BUILTIN:
                reg = 0x0;
@@ -628,12 +622,20 @@ static void emit_trig(struct r500_pfs_compile_state *cs, struct prog_instruction
        temp.Index = get_temp(cs, 0);
        temp.WriteMask = WRITEMASK_W;
 
+       struct prog_src_register srcreg;
+       GLuint constant_swizzle;
+
+       srcreg.File = PROGRAM_CONSTANT;
+       srcreg.Index = _mesa_add_unnamed_constant(cs->compiler->program->Parameters,
+               RCP_2PI, 4, &constant_swizzle);
+       srcreg.Swizzle = constant_swizzle;
+
        /* temp = Input*(1/2pi) */
        ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, temp);
        set_src0(cs, ip, fpi->SrcReg[0]);
-       set_src1_direct(cs, ip, emit_const4fv(cs, RCP_2PI));
+       set_src1(cs, ip, srcreg);
        set_argA(cs, ip, 0, R500_SWIZ_RGB_ZERO, make_sop_swizzle(fpi->SrcReg[0]));
-       set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, SWIZZLE_W);
+       set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, make_alpha_swizzle(srcreg));
        set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO);
 
        /* temp = frac(dst) */
@@ -660,87 +662,6 @@ static void emit_trig(struct r500_pfs_compile_state *cs, struct prog_instruction
        }
 }
 
-/**
- * Emit a LIT instruction.
- *
- * Definition of LIT (from ARB_fragment_program):
- *  tmp = VectorLoad(op0);
- *  if (tmp.x < 0) tmp.x = 0;
- *  if (tmp.y < 0) tmp.y = 0;
- *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
- *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
- *  result.x = 1.0;
- *  result.y = tmp.x;
- *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
- *  result.w = 1.0;
- */
-static void emit_lit(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi)
-{
-       GLuint cnst;
-       int needTemporary;
-       GLuint temp;
-       int ip;
-
-       cnst = emit_const4fv(cs, LIT);
-
-       needTemporary = 0;
-       if (fpi->DstReg.WriteMask != WRITEMASK_XYZW || fpi->DstReg.File == PROGRAM_OUTPUT)
-               needTemporary = 1;
-
-       if (needTemporary) {
-               temp = get_temp(cs, 0);
-       } else {
-               temp = fpi->DstReg.Index;
-       }
-
-       // MAX tmp.xyw, op0, { 0, 0, 0, -128+eps }
-       ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MAX, R500_ALPHA_OP_MAX, temp, WRITEMASK_XYW);
-       set_src0(cs, ip, fpi->SrcReg[0]);
-       set_src1_direct(cs, ip, cnst);
-       set_argA_reg(cs, ip, 0, fpi->SrcReg[0]);
-       set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, SWIZZLE_W);
-
-       // MIN tmp.z, tmp.w, { 128-eps }
-       // LG2 tmp.w, tmp.y
-       ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MIN, R500_ALPHA_OP_LN2, temp, WRITEMASK_ZW);
-       set_src0_direct(cs, ip, temp);
-       set_src1_direct(cs, ip, cnst);
-       set_argA(cs, ip, 0, SWIZZLE_W | (SWIZZLE_W<<3) | (SWIZZLE_W<<6), SWIZZLE_Y);
-       set_argB(cs, ip, 1, SWIZZLE_X | (SWIZZLE_X<<3) | (SWIZZLE_X<<6), SWIZZLE_X);
-
-       // MOV tmp.y, tmp.x
-       // MUL tmp.w, tmp.z, tmp.w
-       ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, temp, WRITEMASK_YW);
-       set_src0_direct(cs, ip, temp);
-       set_argA(cs, ip, 0, SWIZZLE_X | (SWIZZLE_X<<3) | (SWIZZLE_X<<6), SWIZZLE_Z);
-       set_argB(cs, ip, 0, R500_SWIZ_RGB_ONE, SWIZZLE_W);
-       set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO);
-
-       // MOV tmp.x, 1.0
-       // EX2 tmp.w, tmp.w
-       ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_EX2, temp, WRITEMASK_XW);
-       set_src0_direct(cs, ip, temp);
-       set_argA(cs, ip, 0, R500_SWIZ_RGB_ONE, SWIZZLE_W);
-       set_argB(cs, ip, 0, R500_SWIZ_RGB_ONE, R500_SWIZZLE_ZERO);
-       set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO);
-
-       // tmp.z := (-tmp.x >= 0) ? tmp.y : 0.0
-       // MOV tmp.w, 1.0
-       ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, temp, WRITEMASK_ZW);
-       set_src0_direct(cs, ip, temp);
-       set_argA(cs, ip, 0, R500_SWIZZLE_ZERO, R500_SWIZZLE_ONE);
-       set_argB(cs, ip, 0, SWIZZLE_W | (SWIZZLE_W<<3) | (SWIZZLE_W<<6), R500_SWIZZLE_ONE);
-       set_argC(cs, ip, 0, SWIZZLE_Y | (SWIZZLE_Y<<3) | (SWIZZLE_Y<<6) | (R500_SWIZ_MOD_NEG<<9), R500_SWIZZLE_ZERO);
-
-       if (needTemporary) {
-               ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg);
-               set_src0_direct(cs, ip, temp);
-               set_argA(cs, ip, 0, R500_SWIZ_RGB_RGB, SWIZZLE_W);
-               set_argB(cs, ip, 1, R500_SWIZ_RGB_RGB, SWIZZLE_W);
-               set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO);
-       }
-}
-
 static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) {
        PROG_CODE;
        GLuint src[3], dest = 0;
@@ -830,9 +751,6 @@ static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *
                        src[0] = make_src(cs, fpi->SrcReg[0]);
                        emit_sop(cs, R500_ALPHA_OP_LN2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0]));
                        break;
-               case OPCODE_LIT:
-                       emit_lit(cs, fpi);
-                       break;
                case OPCODE_LRP:
                        /* result = src0*src1 + (1-src0)*src2
                         *        = src0*src1 + src2 + (-src0)*src2
index d6d016d7c12c6e47de4d7d8b380a5346cbe24368..85ea810523f4381cea521fb309fed6a4027ee038 100644 (file)
@@ -35,6 +35,8 @@
 
 #include "radeon_program_alu.h"
 
+#include "shader/prog_parameter.h"
+
 
 static struct prog_instruction *emit1(struct gl_program* p,
        gl_inst_opcode Opcode, struct prog_dst_register DstReg,
@@ -101,6 +103,19 @@ static struct prog_dst_register dstreg(int file, int index)
        return dst;
 }
 
+static struct prog_dst_register dstregtmpmask(int index, int mask)
+{ /* Build a destination operand for temporary register `index`, limited to write mask `mask`. */
+       struct prog_dst_register dst;
+       dst.File = PROGRAM_TEMPORARY; /* always a temporary; only index and mask come from the caller */
+       dst.Index = index;
+       dst.WriteMask = mask;
+       dst.CondMask = COND_TR; /* the condition fields and padding below always get these fixed values */
+       dst.CondSwizzle = SWIZZLE_NOOP;
+       dst.CondSrc = 0;
+       dst.pad = 0;
+       return dst;
+}
+
 static const struct prog_src_register builtin_zero = {
        .File = PROGRAM_BUILTIN,
        .Index = 0,
@@ -125,6 +140,15 @@ static struct prog_src_register srcreg(int file, int index)
        return src;
 }
 
+static struct prog_src_register srcregswz(int file, int index, int swz)
+{ /* Build a source operand for register (file, index) with the explicit swizzle `swz`. */
+       struct prog_src_register src = srcreg_undefined; /* start from the srcreg_undefined template; only the three fields below are overridden */
+       src.File = file;
+       src.Index = index;
+       src.Swizzle = swz;
+       return src;
+}
+
 static struct prog_src_register negate(struct prog_src_register reg)
 {
        struct prog_src_register newreg = reg;
@@ -136,10 +160,10 @@ static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x,
 {
        struct prog_src_register swizzled = reg;
        swizzled.Swizzle = MAKE_SWIZZLE4(
-               GET_SWZ(reg.Swizzle, x),
-               GET_SWZ(reg.Swizzle, y),
-               GET_SWZ(reg.Swizzle, z),
-               GET_SWZ(reg.Swizzle, w));
+               x >= 4 ? x : GET_SWZ(reg.Swizzle, x),
+               y >= 4 ? y : GET_SWZ(reg.Swizzle, y),
+               z >= 4 ? z : GET_SWZ(reg.Swizzle, z),
+               w >= 4 ? w : GET_SWZ(reg.Swizzle, w));
        return swizzled;
 }
 
@@ -185,6 +209,93 @@ static void transform_FLR(struct radeon_transform_context* t,
        emit2(t->Program, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
 }
 
+/**
+ * Definition of LIT (from ARB_fragment_program):
+ *
+ *  tmp = VectorLoad(op0);
+ *  if (tmp.x < 0) tmp.x = 0;
+ *  if (tmp.y < 0) tmp.y = 0;
+ *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ *  result.x = 1.0;
+ *  result.y = tmp.x;
+ *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ *  result.w = 1.0;
+ *
+ * The longest path of computation is the one leading to result.z,
+ * consisting of 5 operations. This implementation of LIT takes
+ * 5 slots, if the subsequent optimization passes are clever enough
+ * to pair instructions correctly.
+ */
+static void transform_LIT(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       static const GLfloat LitConst[4] = { -127.999999 }; /* only element 0 is registered (size 1 below); it is negated to obtain the upper bound */
+
+       GLuint constant;
+       GLuint constant_swizzle;
+       GLuint temp;
+       int needTemporary = 0;
+       struct prog_src_register srctemp;
+
+       constant = _mesa_add_unnamed_constant(t->Program->Parameters, LitConst, 1, &constant_swizzle);
+
+       if (inst->DstReg.WriteMask != WRITEMASK_XYZW) {
+               needTemporary = 1;
+       } else if (inst->DstReg.File != PROGRAM_TEMPORARY) {
+               // LIT is typically followed by DP3/DP4, so there's no point
+               // in creating special code for this case
+               needTemporary = 1;
+       }
+
+       if (needTemporary) {
+               temp = radeonFindFreeTemporary(t);
+       } else {
+               temp = inst->DstReg.Index;
+       }
+       srctemp = srcreg(PROGRAM_TEMPORARY, temp);
+
+       // tmp.x = max(0.0, Src.x);
+       // tmp.y = max(0.0, Src.y);
+       // tmp.w = max(Src.w, -128+eps); then tmp.z = min(tmp.w, 128-eps), i.e. the clamped exponent
+       emit2(t->Program, OPCODE_MAX,
+               dstregtmpmask(temp, WRITEMASK_XYW),
+               inst->SrcReg[0],
+               swizzle(srcreg(PROGRAM_CONSTANT, constant),
+                       SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, constant_swizzle&3));
+       emit2(t->Program, OPCODE_MIN,
+               dstregtmpmask(temp, WRITEMASK_Z),
+               swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+               negate(srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle)));
+
+       // tmp.w = Pow(tmp.y, tmp.z), computed as EX2(LG2(tmp.y) * tmp.z)
+       emit1(t->Program, OPCODE_LG2,
+               dstregtmpmask(temp, WRITEMASK_W),
+               swizzle(srctemp, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
+       emit2(t->Program, OPCODE_MUL,
+               dstregtmpmask(temp, WRITEMASK_W),
+               swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+               swizzle(srctemp, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z));
+       emit1(t->Program, OPCODE_EX2,
+               dstregtmpmask(temp, WRITEMASK_W),
+               swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
+
+       // tmp.z = (tmp.x > 0) ? tmp.w : 0.0   (CMP is given -tmp.x as its condition operand)
+       emit3(t->Program, OPCODE_CMP,
+               dstregtmpmask(temp, WRITEMASK_Z),
+               negate(swizzle(srctemp, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
+               swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+               builtin_zero);
+
+       // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
+       emit1(t->Program, OPCODE_MOV,
+               dstregtmpmask(temp, WRITEMASK_XYW),
+               swizzle(srctemp, SWIZZLE_ONE, SWIZZLE_X, SWIZZLE_ONE, SWIZZLE_ONE));
+
+       if (needTemporary)
+               emit1(t->Program, OPCODE_MOV, inst->DstReg, srctemp); /* copy through the original destination so its write mask / register file apply */
+}
+
 static void transform_POW(struct radeon_transform_context* t,
        struct prog_instruction* inst)
 {
@@ -249,13 +360,11 @@ static void transform_XPD(struct radeon_transform_context* t,
  * no userData necessary.
  *
  * Eliminates the following ALU instructions:
- *  ABS, DPH, FLR, POW, SGE, SLT, SUB, SWZ, XPD
+ *  ABS, DPH, FLR, LIT, POW, SGE, SLT, SUB, SWZ, XPD
  * using:
  *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
  *
  * @note should be applicable to R300 and R500 fragment programs.
- *
- * @todo add LIT here as well?
  */
 GLboolean radeonTransformALU(struct radeon_transform_context* t,
        struct prog_instruction* inst,
@@ -265,6 +374,7 @@ GLboolean radeonTransformALU(struct radeon_transform_context* t,
        case OPCODE_ABS: transform_ABS(t, inst); return GL_TRUE;
        case OPCODE_DPH: transform_DPH(t, inst); return GL_TRUE;
        case OPCODE_FLR: transform_FLR(t, inst); return GL_TRUE;
+       case OPCODE_LIT: transform_LIT(t, inst); return GL_TRUE;
        case OPCODE_POW: transform_POW(t, inst); return GL_TRUE;
        case OPCODE_SGE: transform_SGE(t, inst); return GL_TRUE;
        case OPCODE_SLT: transform_SLT(t, inst); return GL_TRUE;